rescaler_mips32: disable ImportRowShrink

This function is failing the 'accum == 0' assert on the skia bots when
rescaling to 13x13.

BUG=skia:6682
Change-Id: I9f9f3adf28cec63ad6e38ed3128f18825d5b70cc
2017-06-02 19:58:33 -07:00
93 changed files with 2301 additions and 7259 deletions
--- a/Android.mk
+++ b/Android.mk
@@ -11,24 +11,12 @@ ifeq ($(APP_OPTIM),release)
  endif
 endif

-# mips32 fails to build with clang from r14b
-# https://bugs.chromium.org/p/webp/issues/detail?id=343
-ifeq ($(findstring clang,$(NDK_TOOLCHAIN_VERSION)),clang)
-  ifeq ($(TARGET_ARCH),mips)
-    clang_version := $(shell $(TARGET_CC) --version)
-    ifneq ($(findstring clang version 3,$(clang_version)),)
-      WEBP_CFLAGS += -no-integrated-as
-    endif
-  endif
-endif
-
 ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
  # specifically.
  NEON := c.neon
  USE_CPUFEATURES := yes
-  WEBP_CFLAGS += -DHAVE_CPU_FEATURES_H
 else
  NEON := c
 endif
@@ -91,7 +79,6 @@ dsp_dec_srcs := \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_mips_dsp_r2.c \
-    src/dsp/yuv_neon.$(NEON) \
    src/dsp/yuv_sse2.c \

 dsp_enc_srcs := \
@@ -114,13 +101,10 @@ dsp_enc_srcs := \
    src/dsp/lossless_enc_neon.$(NEON) \
    src/dsp/lossless_enc_sse2.c \
    src/dsp/lossless_enc_sse41.c \
-    src/dsp/ssim.c \
-    src/dsp/ssim_sse2.c \

 enc_srcs := \
    src/enc/alpha_enc.c \
    src/enc/analysis_enc.c \
-    src/enc/backward_references_cost_enc.c \
    src/enc/backward_references_enc.c \
    src/enc/config_enc.c \
    src/enc/cost_enc.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,21 +3,13 @@ cmake_minimum_required(VERSION 2.8.7)
 project(libwebp C)

 # Options for coder / decoder executables.
-option(WEBP_ENABLE_SIMD "Enable any SIMD optimization." ON)
-option(WEBP_ENABLE_WASM "Enable WebAssembly optimizations." OFF)
 option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
 option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
 option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." OFF)
 option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." OFF)
-option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." OFF)
-option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
 option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces." OFF)

-if(WEBP_BUILD_WEBP_JS OR WEBP_ENABLE_WASM)
-  set(WEBP_ENABLE_SIMD OFF)
-endif()
-
 set(WEBP_DEP_LIBRARIES)
 set(WEBP_DEP_INCLUDE_DIRS)

@@ -29,18 +21,11 @@ endif()

 include(cmake/config.h.cmake)

-# Extract the version of the library.
-file(READ ${CMAKE_CURRENT_SOURCE_DIR}/configure.ac SOURCE_FILE)
-string(REGEX MATCH "[0-9.]+" WEBP_VERSION ${SOURCE_FILE})
-
 ################################################################################
 # Options.
 if(WEBP_ENABLE_SWAP_16BIT_CSP)
  add_definitions(-DWEBP_SWAP_16BIT_CSP)
 endif()
-if(WEBP_ENABLE_WASM)
-  add_definitions(-DWEBP_USE_WASM)
-endif()

 ################################################################################
 # Android only.
@@ -54,110 +39,48 @@ if(ANDROID)
  set(WEBP_DEP_INCLUDE_DIRS ${WEBP_DEP_INCLUDE_DIRS}
    ${ANDROID_NDK}/sources/android/cpufeatures
  )
-  add_definitions(-DHAVE_CPU_FEATURES_H)
 endif()

 ################################################################################
 # WebP source files.
 # Read the Makefile.am to get the source files.

-# We expect the Makefiles to define the sources as defined in
-# the first regex. E.g.:
-# libimagedec_la_SOURCES  = image_dec.c image_dec.h
-function(parse_Makefile_am FOLDER VAR SRC_REGEX)
+function(parse_Makefile_am FOLDER VAR)
  file(READ ${FOLDER}/Makefile.am MAKEFILE_AM)
-  string(REGEX MATCHALL "${SRC_REGEX}_SOURCES[ ]*\\+?=[ ]+[0-9a-z\\._ ]*"
+  string(REGEX MATCHALL "_SOURCES \\+= [^\n]*"
    FILES_PER_LINE ${MAKEFILE_AM}
  )
  set(SRCS ${${VAR}})
  foreach(FILES ${FILES_PER_LINE})
-    string(FIND ${FILES} "=" OFFSET)
-    math(EXPR OFFSET "${OFFSET} + 2")
-    string(SUBSTRING ${FILES} ${OFFSET} -1 FILES)
-    if(FILES)
-      string(REGEX MATCHALL "[0-9a-z\\._]+"
-        FILES ${FILES}
-      )
-      foreach(FILE ${FILES})
-        list(APPEND SRCS ${FOLDER}/${FILE})
-      endforeach()
-    endif()
+    string(SUBSTRING ${FILES} 12 -1 FILES)
+    string(REGEX MATCHALL "[0-9a-z\\._]+"
+      FILES ${FILES}
+    )
+    foreach(FILE ${FILES})
+      list(APPEND SRCS ${FOLDER}/${FILE})
+    endforeach()
  endforeach()
  set(${VAR} ${SRCS} PARENT_SCOPE)
 endfunction()

-set(WEBP_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
-parse_Makefile_am(${WEBP_SRC_DIR}/dec "WEBP_DEC_SRCS" "")
-parse_Makefile_am(${WEBP_SRC_DIR}/demux "WEBP_DEMUX_SRCS" "")
-parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_COMMON_SRCS" "COMMON")
-parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_ENC_SRCS" "ENC")
-parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_ENC_SRCS" "dsp_[^ ]*")
-parse_Makefile_am(${WEBP_SRC_DIR}/dsp "WEBP_DSP_DEC_SRCS" "decode_[^ ]*")
-parse_Makefile_am(${WEBP_SRC_DIR}/enc "WEBP_ENC_SRCS" "")
-parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_COMMON_SRCS" "COMMON")
-parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_ENC_SRCS" "ENC")
-parse_Makefile_am(${WEBP_SRC_DIR}/utils "WEBP_UTILS_DEC_SRCS" "decode_[^ ]*")
+set(WEBP_SRCS)
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dec "WEBP_SRCS")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/demux "WEBP_SRCS")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/dsp "WEBP_SRCS")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/enc "WEBP_SRCS")
+parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/utils "WEBP_SRCS")

 # Remove the files specific to SIMD we don't use.
 foreach(FILE ${WEBP_SIMD_FILES_NOT_TO_INCLUDE})
-  list(REMOVE_ITEM WEBP_DSP_ENC_SRCS ${FILE})
-  list(REMOVE_ITEM WEBP_DSP_DEC_SRCS ${FILE})
+  list(REMOVE_ITEM WEBP_SRCS ${FILE})
 endforeach()

-### Define the mandatory libraries.
-# Build the webpdecoder library.
+# Build the library.
 add_definitions(-Wall)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ ${WEBP_DEP_INCLUDE_DIRS})
-add_library(webpdecode OBJECT ${WEBP_DEC_SRCS})
-add_library(webpdspdecode OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS})
-add_library(webputilsdecode OBJECT ${WEBP_UTILS_COMMON_SRCS}
-  ${WEBP_UTILS_DEC_SRCS})
-add_library(webpdecoder $<TARGET_OBJECTS:webpdecode>
-  $<TARGET_OBJECTS:webpdspdecode> $<TARGET_OBJECTS:webputilsdecode>)
-target_link_libraries(webpdecoder ${WEBP_DEP_LIBRARIES})
-
-# Build the webp library.
-add_library(webpencode OBJECT ${WEBP_ENC_SRCS})
-add_library(webpdsp OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS}
-  ${WEBP_DSP_ENC_SRCS})
-add_library(webputils OBJECT ${WEBP_UTILS_COMMON_SRCS} ${WEBP_UTILS_DEC_SRCS}
-  ${WEBP_UTILS_ENC_SRCS})
-add_library(webp $<TARGET_OBJECTS:webpdecode> $<TARGET_OBJECTS:webpdsp>
-  $<TARGET_OBJECTS:webpencode> $<TARGET_OBJECTS:webputils>)
+add_library(webp ${WEBP_SRCS})
 target_link_libraries(webp ${WEBP_DEP_LIBRARIES})

-# Make sure the OBJECT libraries are built with position independent code
-# (it is not ON by default).
-set_target_properties(webpdecode webpdspdecode webputilsdecode
-  webpencode webpdsp webputils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-# Build the webp demux library.
-add_library(webpdemux ${WEBP_DEMUX_SRCS})
-target_link_libraries(webpdemux webp)
-
-# Set the version numbers.
-function(parse_version FILE NAME VAR)
-  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/src/${FILE} SOURCE_FILE)
-  string(REGEX MATCH "${NAME}_la_LDFLAGS[^\n]* -version-info [0-9:]+" TMP
-    ${SOURCE_FILE})
-  string(REGEX MATCH "[0-9:]+" TMP ${TMP})
-  string(REGEX REPLACE ":" "." VERSION ${TMP})
-  set(${VAR} "${VERSION}" PARENT_SCOPE)
-endfunction()
-parse_version(Makefile.am webp WEBP_WEBP_SOVERSION)
-set_target_properties(webp PROPERTIES VERSION ${WEBP_VERSION}
-  SOVERSION ${WEBP_WEBP_SOVERSION})
-parse_version(Makefile.am webpdecoder WEBP_DECODER_SOVERSION)
-set_target_properties(webpdecoder PROPERTIES VERSION ${WEBP_VERSION}
-  SOVERSION ${WEBP_DECODER_SOVERSION})
-parse_version(demux/Makefile.am webpdemux WEBP_DEMUX_SOVERSION)
-set_target_properties(webpdemux PROPERTIES VERSION ${WEBP_VERSION}
-  SOVERSION ${WEBP_DEMUX_SOVERSION})
-
-# Define the libraries to install.
-set(INSTALLED_LIBRARIES webpdecoder webp webpdemux)
-
-### Deal with SIMD.
 # Change the compile flags for SIMD files we use.
 list(LENGTH WEBP_SIMD_FILES_TO_INCLUDE WEBP_SIMD_FILES_TO_INCLUDE_LENGTH)
 math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
@@ -167,176 +90,100 @@ math(EXPR WEBP_SIMD_FILES_TO_INCLUDE_RANGE
 foreach(I_FILE RANGE ${WEBP_SIMD_FILES_TO_INCLUDE_RANGE})
  list(GET WEBP_SIMD_FILES_TO_INCLUDE ${I_FILE} FILE)
  list(GET WEBP_SIMD_FLAGS_TO_INCLUDE ${I_FILE} SIMD_COMPILE_FLAG)
-  if(NOT ${SIMD_COMPILE_FLAG} STREQUAL "NOTFOUND")
-    set_source_files_properties(${FILE} PROPERTIES
-      COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
-    )
-  endif()
+  set_source_files_properties(${FILE} PROPERTIES
+    COMPILE_FLAGS ${SIMD_COMPILE_FLAG}
+  )
 endforeach()

 # Build the executables if asked for.
 if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
-   WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP OR WEBP_BUILD_WEBP_JS)
+   WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
  # Example utility library.
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "EXAMPLEUTIL_SRCS"
-    "example_util_[^ ]*")
-  list(APPEND EXAMPLEUTIL_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
-  add_library(exampleutil ${EXAMPLEUTIL_SRCS})
+  set(exampleutil_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/example_util.h)
+  add_library(exampleutil ${exampleutil_SRCS})
+  target_link_libraries(exampleutil webp ${WEBP_DEP_LIBRARIES})

-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEIOUTILS_SRCS"
-    "imageio_util_[^ ]*")
-  add_library(imageioutil ${IMAGEIOUTILS_SRCS})
-  target_link_libraries(imageioutil webp)
+  set(imageioutil_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/imageio_util.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/imageio_util.h)
+  add_library(imageioutil ${imageioutil_SRCS})
+  target_link_libraries(imageioutil ${WEBP_DEP_LIBRARIES})

  # Image-decoding utility library.
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEDEC_SRCS"
-    "imagedec_[^ ]*")
-  add_library(imagedec ${IMAGEDEC_SRCS})
-  target_link_libraries(imagedec imageioutil webp ${WEBP_DEP_IMG_LIBRARIES})
+  set(imagedec_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/gifdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/gifdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_dec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_dec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/jpegdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/jpegdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/metadata.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/metadata.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/pngdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/pngdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/tiffdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/tiffdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/webpdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/webpdec.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/wicdec.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/wicdec.h)
+  add_library(imagedec ${imagedec_SRCS})
+  target_link_libraries(imagedec webp ${WEBP_DEP_LIBRARIES}
+    ${WEBP_DEP_IMG_LIBRARIES})

  # Image-encoding utility library.
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEENC_SRCS"
-    "imageenc_[^ ]*")
-  add_library(imageenc ${IMAGEENC_SRCS})
-  target_link_libraries(imageenc webp)
+  set(imageenc_SRCS
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_enc.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/imageio/image_enc.h)
+  add_library(imageenc ${imageenc_SRCS})
+  target_link_libraries(imageenc webp imageioutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES})
 endif()

 if(WEBP_BUILD_DWEBP)
  # dwebp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "DWEBP_SRCS"
-    "dwebp")
-  add_executable(dwebp ${DWEBP_SRCS})
-  target_link_libraries(dwebp exampleutil imagedec imageenc webpdecoder)
-  install(TARGETS dwebp RUNTIME DESTINATION bin)
+  add_executable(dwebp
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/dwebp.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
+  target_link_libraries(dwebp imagedec imageenc webp
+    exampleutil imageioutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
+  )
 endif()

 if(WEBP_BUILD_CWEBP)
  # cwebp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "CWEBP_SRCS"
-    "cwebp")
-  add_executable(cwebp ${CWEBP_SRCS})
-  target_link_libraries(cwebp exampleutil imagedec webp)
-  install(TARGETS cwebp RUNTIME DESTINATION bin)
-endif()
-
-if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "WEBP_MUX_SRCS"
-    "")
-  add_library(webpmux ${WEBP_MUX_SRCS})
-  target_link_libraries(webpmux webp)
-  parse_version(mux/Makefile.am webpmux WEBP_MUX_SOVERSION)
-  set_target_properties(webpmux PROPERTIES VERSION ${WEBP_VERSION}
-    SOVERSION ${WEBP_MUX_SOVERSION})
-  list(APPEND INSTALLED_LIBRARIES webpmux)
+  add_executable(cwebp
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/cwebp.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
+  target_link_libraries(cwebp imagedec webp exampleutil imageioutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
+  )
 endif()

 if(WEBP_BUILD_GIF2WEBP)
  # gif2webp
-  include_directories(${WEBP_DEP_GIF_INCLUDE_DIRS})
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "GIF2WEBP_SRCS"
-    "gif2webp")
+  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
+  set(GIF2WEBP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/examples/gif2webp.c)
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "GIF2WEBP_SRCS")
  add_executable(gif2webp ${GIF2WEBP_SRCS})
-  target_link_libraries(gif2webp exampleutil imageioutil webp webpmux
-    ${WEBP_DEP_GIF_LIBRARIES})
-  install(TARGETS gif2webp RUNTIME DESTINATION bin)
+  target_link_libraries(gif2webp imagedec webp exampleutil imageioutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
+  )
 endif()

 if(WEBP_BUILD_IMG2WEBP)
  # img2webp
  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "IMG2WEBP_SRCS"
-    "img2webp")
+  set(IMG2WEBP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/examples/img2webp.c)
+  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/src/mux "IMG2WEBP_SRCS")
  add_executable(img2webp ${IMG2WEBP_SRCS})
-  target_link_libraries(img2webp exampleutil imagedec imageioutil webp webpmux)
-  install(TARGETS img2webp RUNTIME DESTINATION bin)
+  target_link_libraries(img2webp imagedec webp exampleutil imageioutil
+    ${WEBP_DEP_LIBRARIES} ${WEBP_DEP_IMG_LIBRARIES}
+  )
 endif()
-
-if (WEBP_BUILD_WEBPINFO)
-  # webpinfo
-  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
-  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "WEBPINFO_SRCS"
-    "webpinfo")
-  add_executable(webpinfo ${WEBPINFO_SRCS})
-  target_link_libraries(webpinfo exampleutil imageioutil)
-  install(TARGETS webpinfo RUNTIME DESTINATION bin)
-endif()
-
-if(WEBP_BUILD_WEBP_JS)
-  # JavaScript version
-  add_executable(webp_js
-                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
-  target_link_libraries(webp_js webpdecoder SDL)
-  set_target_properties(webp_js PROPERTIES LINK_FLAGS
-      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
-  set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
-  target_compile_definitions(webp_js PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)
-
-  # WASM version
-  add_executable(webp_wasm
-                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
-  target_link_libraries(webp_wasm webpdecoder SDL)
-  set_target_properties(webp_wasm PROPERTIES LINK_FLAGS
-      "-s WASM=1 -s 'BINARYEN_METHOD=\"native-wasm\"' \
-      -s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
-  target_compile_definitions(webp_wasm PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)
-
-  target_compile_definitions(webpdecoder PUBLIC EMSCRIPTEN)
-endif()
-
-# Install the different headers and libraries.
-install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h
-        DESTINATION include/webp)
-install(TARGETS ${INSTALLED_LIBRARIES}
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
-
-# Create the CMake version file.
-include(CMakePackageConfigHelpers)
-write_basic_package_version_file(
-  "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
-  VERSION ${WEBP_VERSION}
-  COMPATIBILITY AnyNewerVersion
-)
-
-# Create the Config file.
-include(CMakePackageConfigHelpers)
-set(ConfigPackageLocation share/WebP/cmake/)
-configure_package_config_file(
-  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/WebPConfig.cmake.in
-  ${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake
-  INSTALL_DESTINATION ${ConfigPackageLocation}
-)
-
-# Install the generated CMake files.
-install(
-  FILES "${CMAKE_CURRENT_BINARY_DIR}/WebPConfigVersion.cmake"
-        "${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake"
-  DESTINATION ${ConfigPackageLocation}
-)
-
-# Install the man pages.
-set(MAN_PAGES cwebp.1 dwebp.1 gif2webp.1 img2webp.1 vwebp.1 webpmux.1
-  webpinfo.1)
-set(EXEC_BUILDS "CWEBP" "DWEBP" "GIF2WEBP" "IMG2WEBP" "VWEBP" "WEBPMUX"
-  "WEBPINFO")
-list(LENGTH MAN_PAGES MAN_PAGES_LENGTH)
-math(EXPR MAN_PAGES_RANGE "${MAN_PAGES_LENGTH} - 1")
-
-foreach(I_MAN RANGE ${MAN_PAGES_RANGE})
-  list(GET EXEC_BUILDS ${I_MAN} EXEC_BUILD)
-  if(WEBP_BUILD_${EXEC_BUILD})
-    list(GET MAN_PAGES ${I_MAN} MAN_PAGE)
-    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/man/${MAN_PAGE}
-      DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1
-      COMPONENT doc
-    )
-  endif()
-endforeach()
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -229,7 +229,6 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
    $(DIROBJ)\dsp\yuv_mips_dsp_r2.obj \
-    $(DIROBJ)\dsp\yuv_neon.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \

 DSP_ENC_OBJS = \
@@ -255,8 +254,6 @@ DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\lossless_enc_neon.obj \
    $(DIROBJ)\dsp\lossless_enc_sse2.obj \
    $(DIROBJ)\dsp\lossless_enc_sse41.obj \
-    $(DIROBJ)\dsp\ssim.obj \
-    $(DIROBJ)\dsp\ssim_sse2.obj \

 EX_ANIM_UTIL_OBJS = \
    $(DIROBJ)\examples\anim_util.obj \
@@ -266,7 +263,6 @@ IMAGEIO_DEC_OBJS = \
    $(DIROBJ)\imageio\jpegdec.obj \
    $(DIROBJ)\imageio\metadata.obj \
    $(DIROBJ)\imageio\pngdec.obj \
-    $(DIROBJ)\imageio\pnmdec.obj \
    $(DIROBJ)\imageio\tiffdec.obj \
    $(DIROBJ)\imageio\webpdec.obj \
    $(DIROBJ)\imageio\wicdec.obj \
@@ -283,7 +279,6 @@ EX_UTIL_OBJS = \
 ENC_OBJS = \
    $(DIROBJ)\enc\alpha_enc.obj \
    $(DIROBJ)\enc\analysis_enc.obj \
-    $(DIROBJ)\enc\backward_references_cost_enc.obj \
    $(DIROBJ)\enc\backward_references_enc.obj \
    $(DIROBJ)\enc\config_enc.obj \
    $(DIROBJ)\enc\cost_enc.obj \
@@ -349,8 +344,7 @@ all: ex
 OUT_EXAMPLES = $(DIRBIN)\cwebp.exe $(DIRBIN)\dwebp.exe
 EXTRA_EXAMPLES = $(DIRBIN)\vwebp.exe $(DIRBIN)\webpmux.exe \
                 $(DIRBIN)\img2webp.exe $(DIRBIN)\get_disto.exe \
-                 $(DIRBIN)\webp_quality.exe $(DIRBIN)\vwebp_sdl.exe \
-                 $(DIRBIN)\webpinfo.exe
+                 $(DIRBIN)\webp_quality.exe

 ex: $(OUT_LIBS) $(OUT_EXAMPLES)
 all: ex $(EXTRA_EXAMPLES)
@@ -372,9 +366,6 @@ $(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBPMUX)
 $(DIRBIN)\gif2webp.exe: $(LIBWEBP)
 $(DIRBIN)\vwebp.exe: $(DIROBJ)\examples\vwebp.obj $(EX_UTIL_OBJS)
 $(DIRBIN)\vwebp.exe: $(IMAGEIO_UTIL_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
-$(DIRBIN)\vwebp_sdl.exe: $(DIROBJ)\extras\vwebp_sdl.obj
-$(DIRBIN)\vwebp_sdl.exe: $(DIROBJ)\extras\webp_to_sdl.obj
-$(DIRBIN)\vwebp_sdl.exe: $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\img2webp.exe: $(DIROBJ)\examples\img2webp.obj $(LIBWEBPMUX)
@@ -382,12 +373,10 @@ $(DIRBIN)\img2webp.exe: $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\img2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\get_disto.exe: $(DIROBJ)\extras\get_disto.obj
 $(DIRBIN)\get_disto.exe: $(IMAGEIO_DEC_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
+
 $(DIRBIN)\webp_quality.exe: $(DIROBJ)\extras\webp_quality.obj
 $(DIRBIN)\webp_quality.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\webp_quality.exe: $(EXTRAS_OBJS) $(LIBWEBP)
-$(DIRBIN)\webpinfo.exe: $(DIROBJ)\examples\webpinfo.obj
-$(DIRBIN)\webpinfo.exe: $(IMAGEIO_DEC_OBJS)
-$(DIRBIN)\webpinfo.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)

 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS): $(OUTPUT_DIRS)
--- a/15
+++ b/15
@@ -113,8 +113,8 @@ make install

 CMake:
 ------
-With CMake, you can compile libwebp, cwebp, dwebp, gif2web, img2webp and the
-JS bindings.
+The support for CMake is minimal: it only helps you compile libwebp, cwebp and
+dwebp.

 Prerequisites:
 A compiler (e.g., gcc with autotools) and CMake.
@@ -123,25 +123,18 @@ minimal build:
 $ sudo apt-get install build-essential cmake

 When building from git sources, you will need to run cmake to generate the
-makefiles.
+configure script.

 mkdir build && cd build && cmake ../
 make
 make install

-If you also want any of the executables, you will need to enable them through
-CMake, e.g.:
+If you also want cwebp or dwebp, you will need to enable them through CMake:

 cmake -DWEBP_BUILD_CWEBP=ON -DWEBP_BUILD_DWEBP=ON ../

 or through your favorite interface (like ccmake or cmake-qt-gui).

-Finally, once installed, you can also use WebP in your CMake project by doing:
-
-find_package(WebP)
-
-which will define the CMake variables WebP_INCLUDE_DIRS and WebP_LIBRARIES.
-
 Gradle:
 -------
 The support for Gradle is minimal: it only helps you compile libwebp, cwebp and
--- a/README.wasm
+++ b/README.wasm
@@ -1,91 +0,0 @@
-Description:
-============
-
-This file describes the compilation of libwebp using portable intrinsics /
-WebAssembly (wasm) to native targets using clang and CMake.
-
-Prerequisites:
-==============
-
- cmake 2.8+
-
- clang 3.9+ for portable intrinsics support; as wasm progresses a tip of tree
-  build may be necessary.
-
-Building:
-=========
-
- - configure the project with CMake using:
-
- $ mkdir -p build && \
-   cd build && \
-   cmake -DWEBP_BUILD_DWEBP=1 -DCMAKE_C_COMPILER=clang -DWEBP_ENABLE_WASM=1 ../
-
- - compile dwebp using 'make'.
-
- - Note this currently generates native executables only and is incompatible
-   with -DWEBP_BUILD_WEBP_JS.
-
-Build options:
-==============
-
- platform specific multiply high (mulhi) implementation, disabled by default.
-  arm: -DCMAKE_C_FLAGS='-DENABLE_NEON_BUILTIN_MULHI_INT16X8 ...'
-  x86: -DCMAKE_C_FLAGS='-DENABLE_X86_BUILTIN_MULHI_INT16X8 ...'
-
-Cross compilation:
-==================
-
- - arm toolchains can be obtained from:
-   http://www.linaro.org/downloads/
-
- - the android ndk can be obtained from:
-   https://developer.android.com/ndk/downloads/index.html
-
-armv7:
------
-
-Android:
- $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
-   --arch arm --api 24 --stl gnustl --install-dir /opt/android-arm-24
- $ mkdir -p build && cd build
- $ cmake ../libwebp \
-   -DWEBP_BUILD_DWEBP=1 \
-   -DCMAKE_C_COMPILER=/opt/android-arm-24/bin/clang \
-   -DCMAKE_PREFIX_PATH=/opt/android-arm-24/sysroot/usr/lib \
-   -DCMAKE_C_FLAGS=-fPIE \
-   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
-   -DCMAKE_BUILD_TYPE=Release \
-   -DWEBP_ENABLE_WASM=1
-
-Linux:
- $ gcc_arm=/opt/gcc-arm; target=arm-linux-gnueabihf
- $ mkdir -p build && cd build
- $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
-   -DCMAKE_C_COMPILER=clang \
-   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc -march=armv7-a -mfpu=neon" \
-   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
-
-aarch64 / arm64:
----------------
-
-Android:
- $ ./android-ndk-r15b/build/tools/make_standalone_toolchain.py \
-   --arch arm64 --api 24 --stl gnustl --install-dir /opt/android-arm64-24
- $ mkdir -p build && cd build
- $ cmake ../libwebp \
-   -DWEBP_BUILD_DWEBP=1 \
-   -DCMAKE_C_COMPILER=/opt/android-arm64-24/bin/clang \
-   -DCMAKE_PREFIX_PATH=/opt/android-arm64-24/sysroot/usr/lib \
-   -DCMAKE_C_FLAGS=-fPIE \
-   -DCMAKE_EXE_LINKER_FLAGS=-Wl,-pie \
-   -DCMAKE_BUILD_TYPE=Release \
-   -DWEBP_ENABLE_WASM=1
-
-Linux:
- $ gcc_arm=/opt/gcc-aarch64; target=aarch64-linux-gnu
- $ mkdir -p build && cd build
- $ cmake ../libwebp -DWEBP_BUILD_DWEBP=1 -DWEBP_ENABLE_WASM=1 \
-   -DCMAKE_C_COMPILER=clang \
-   -DCMAKE_C_FLAGS="--target=$target --gcc-toolchain=$gcc_arm --sysroot=$gcc_arm/$target/libc" \
-   -DCMAKE_PREFIX_PATH=$gcc_arm/$target/libc/usr
--- a/README.webp_js
+++ b/README.webp_js
@@ -1,80 +0,0 @@
-     __   __ ____ ____ ____     __  ____
-    /  \\/  \  _ \  _ \  _ \   (__)/  __\
-    \       /  __/ _  \  __/   _)  \_   \
-     \__\__/_____/____/_/     /____/____/
-
-Description:
-============
-
-This file describes the compilation of libwebp into a JavaScript decoder
-using Emscripten and CMake.
-
- - install the Emscripten SDK following the procedure described at:
-   https://kripken.github.io/emscripten-site/docs/getting_started/downloads.html
-   After installation, you should have some global variable positioned to the
-   location of the SDK. In particular, $EMSCRIPTEN should point to the
-   top-level directory containing Emscripten tools.
-
- - make sure the file $EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake is
-   accessible. This is the toolchain file used by CMake to invoke Emscripten.
-
- - configure the project 'WEBP_JS' with CMake using:
-
- cd webp_js && \
- cmake -DWEBP_BUILD_WEBP_JS=ON \
-       -DEMSCRIPTEN_GENERATE_BITCODE_STATIC_LIBRARIES=1 \
-       -DCMAKE_TOOLCHAIN_FILE=$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake \
-       ../
-
- - compile webp.js using 'make'.
-
- - that's it! Upon completion, you should have the webp.js and
-   webp.js.mem files generated.
-
- - Note this generates both webp_js and webp_wasm without any SIMD enabled due
-   to bugs with this toolchain associated with the SSE2 code.
-   -DWEBP_ENABLE_WASM is currently meant to generate native (x86, arm)
-   executables (dwebp, cwebp) and is incompatible with -DWEBP_BUILD_WEBP_JS.
-
-The callable JavaScript function is WebPToSDL(), which decodes a raw WebP
-bitstream into a canvas. See webp_js/index.html for a simple usage sample.
-
-Demo HTML page:
-===============
-
-   The HTML page webp_js/index.html requires an HTTP server to serve the WebP
-   image example. It's easy to just use Python for that.
-
-cd webp_js && python -m SimpleHTTPServer 8080
-
-and then navigate to http://localhost:8080 in your favorite browser.
-
-
-Web-Assembly (WASM) version:
-============================
-
-  CMakeLists.txt is configured to build the WASM version when using
-  the option WEBP_BUILD_WEBP_JS=ON. The compilation step will assemble
-  the files 'webp_wasm.js', 'webp_wasm.wasm' in the webp_js/ directory.
-  See webp_js/index_wasm.html for a simple demo page using the WASM version
-  of the library.
-
-  You will need a fairly recent version of Emscripten (at least 1.37.8) and of
-  your WASM-enabled browser to run this version. Consider it very experimental!
-
-Caveat:
-=======
-
-  - First decoding using the library is usually slower, due to just-in-time
-    compilation.
-
-  - Some versions of llvm produce the following compile error when SSE2 is
-    enabled.
-
-"Unsupported:   %516 = bitcast <8 x i16> %481 to i128
- LLVM ERROR: BitCast Instruction not yet supported for integer types larger than 64 bits"
-
-    The corresponding Emscripten bug is at:
-    https://github.com/kripken/emscripten/issues/3788
-
-    Therefore, SSE2 optimization is currently disabled in CMakeLists.txt.
--- a/build.gradle
+++ b/build.gradle
@@ -74,17 +74,9 @@ model {
          cCompiler.args "-frename-registers -s"
        }
      }
-      // mips32 fails to build with clang from r14b
-      // https://bugs.chromium.org/p/webp/issues/detail?id=343
-      if (toolChain in Clang) {
-        if (getTargetPlatform() == "mips") {
-          cCompiler.args "-no-integrated-as"
-        }
-      }
      // Check for NEON usage.
      if (getTargetPlatform() == "arm" || getTargetPlatform() == "arm64") {
        NEON = "c.neon"
-        cCompiler.define "HAVE_CPU_FEATURES_H"
      } else {
        NEON = "c"
      }
@@ -156,7 +148,6 @@ model {
            include "yuv.c"
            include "yuv_mips32.c"
            include "yuv_mips_dsp_r2.c"
-            include "yuv_neon.$NEON"
            include "yuv_sse2.c"
            srcDir "src/utils"
            include "bit_reader_utils.c"
@@ -188,12 +179,9 @@ model {
            include "lossless_enc_neon.$NEON"
            include "lossless_enc_sse2.c"
            include "lossless_enc_sse41.c"
-            include "ssim.c"
-            include "ssim_sse2.c"
            srcDir "src/enc"
            include "alpha_enc.c"
            include "analysis_enc.c"
-            include "backward_references_cost_enc.c"
            include "backward_references_enc.c"
            include "config_enc.c"
            include "cost_enc.c"
@@ -300,7 +288,6 @@ model {
            include "jpegdec.c"
            include "metadata.c"
            include "pngdec.c"
-            include "pnmdec.c"
            include "tiffdec.c"
            include "webpdec.c"
          }
@@ -402,24 +389,6 @@ model {
        }
      }
    }
-
-    webpinfo_example(NativeExecutableSpec) {
-      binaries {
-        all {
-          lib library: "example_util", linkage: "static"
-          lib library: "imageio_util", linkage: "static"
-          lib library: "webp"
-        }
-      }
-      sources {
-        c {
-          source {
-            srcDir "./examples"
-            include "webpinfo.c"
-          }
-        }
-      }
-    }
  }
  tasks {
    // Task to test all possible configurations.
--- a/cmake/WebPConfig.cmake.in
+++ b/cmake/WebPConfig.cmake.in
@@ -1,6 +0,0 @@
-@PACKAGE_INIT@
-
-set(WebP_INCLUDE_DIRS "webp")
-set(WEBP_INCLUDE_DIRS ${WebP_INCLUDE_DIRS})
-set(WebP_LIBRARIES "@INSTALLED_LIBRARIES@")
-set(WEBP_LIBRARIES "${WebP_LIBRARIES}")
--- a/cmake/config.h.cmake
+++ b/cmake/config.h.cmake
@@ -65,7 +65,7 @@ endif()
 # Find the standard image libraries.
 set(WEBP_DEP_IMG_LIBRARIES)
 set(WEBP_DEP_IMG_INCLUDE_DIRS)
-foreach(I_LIB PNG JPEG TIFF)
+foreach(I_LIB PNG JPEG TIFF GIF)
  find_package(${I_LIB})
  set(WEBP_HAVE_${I_LIB} ${${I_LIB}_FOUND})
  if(${I_LIB}_FOUND)
@@ -74,16 +74,6 @@ foreach(I_LIB PNG JPEG TIFF)
  endif()
 endforeach()

-# GIF detection, gifdec isn't part of the imageio lib.
-set(WEBP_DEP_GIF_LIBRARIES)
-set(WEBP_DEP_GIF_INCLUDE_DIRS)
-find_package(GIF)
-set(WEBP_HAVE_GIF ${GIF_FOUND})
-if(GIF_FOUND)
-  list(APPEND WEBP_DEP_GIF_LIBRARIES ${GIF_LIBRARIES})
-  list(APPEND WEBP_DEP_GIF_INCLUDE_DIRS ${GIF_INCLUDE_DIR})
-endif()
-
 ## Check for specific headers.
 include(CheckIncludeFiles)
 check_include_files("stdlib.h;stdarg.h;string.h;float.h" STDC_HEADERS)
--- a/cmake/cpu.cmake
+++ b/cmake/cpu.cmake
@@ -1,11 +1,6 @@
 ## Check for SIMD extensions.

-function(webp_check_compiler_flag WEBP_SIMD_FLAG ENABLE_SIMD)
-  if(NOT ENABLE_SIMD)
-    message(STATUS "Disabling ${WEBP_SIMD_FLAG} optimization.")
-    set(WEBP_HAVE_${WEBP_SIMD_FLAG} 0 PARENT_SCOPE)
-    return()
-  endif()
+function(webp_check_compiler_flag WEBP_SIMD_FLAG)
  unset(WEBP_HAVE_FLAG_${WEBP_SIMD_FLAG} CACHE)
  check_c_source_compiles("
      #include \"${CMAKE_CURRENT_LIST_DIR}/../src/dsp/dsp.h\"
@@ -61,11 +56,11 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
  # (especially on Android).
  unset(WEBP_HAVE_${WEBP_SIMD_FLAG} CACHE)
  set(CMAKE_REQUIRED_FLAGS)
-  webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
+  webp_check_compiler_flag(${WEBP_SIMD_FLAG})
  if(NOT WEBP_HAVE_${WEBP_SIMD_FLAG})
    list(GET SIMD_ENABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
    set(CMAKE_REQUIRED_FLAGS ${SIMD_COMPILE_FLAG})
-    webp_check_compiler_flag(${WEBP_SIMD_FLAG} ${WEBP_ENABLE_SIMD})
+    webp_check_compiler_flag(${WEBP_SIMD_FLAG})
  else()
    set(SIMD_COMPILE_FLAG " ")
  endif()
@@ -85,11 +80,8 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    foreach(FILE ${SIMD_FILES})
      list(APPEND WEBP_SIMD_FILES_NOT_TO_INCLUDE ${FILE})
    endforeach()
-    # Explicitly disable SIMD. Avoid this with WASM to avoid an ICE with clang:
-    # https://bugs.chromium.org/p/webp/issues/detail?id=350
-    # WASM overrides the native SIMD so building it in is harmless aside from
-    # binary size.
-    if(NOT WEBP_ENABLE_WASM AND SIMD_DISABLE_FLAGS)
+    # Explicitly disable SIMD.
+    if(SIMD_DISABLE_FLAGS)
      list(GET SIMD_DISABLE_FLAGS ${I_SIMD} SIMD_COMPILE_FLAG)
      include(CheckCCompilerFlag)
      if(SIMD_COMPILE_FLAG)
@@ -119,13 +111,3 @@ foreach(I_SIMD RANGE ${WEBP_SIMD_FLAGS_RANGE})
    endif()
  endif()
 endforeach()
-
-## Add *_wasm.c files if enabled.
-if(WEBP_ENABLE_WASM)
-  file(GLOB SIMD_FILES "${CMAKE_CURRENT_LIST_DIR}/../"
-    "src/dsp/*_wasm.c"
-  )
-  foreach(FILE ${SIMD_FILES})
-    list(APPEND WEBP_SIMD_FILES_TO_INCLUDE ${FILE})
-  endforeach()
-endif()
--- a/configure.ac
+++ b/configure.ac
@@ -67,7 +67,6 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
          CFLAGS="$SAVED_CFLAGS"])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-fvisibility=hidden])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
-TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wconstant-conversion])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wfloat-conversion])
@@ -76,7 +75,6 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat -Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wold-style-definition])
-TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wparentheses-equality])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshorten-64-to-32])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunreachable-code])
@@ -243,13 +241,9 @@ AS_IF([test "x$enable_neon" != "xno"], [
          NEON_FLAGS=""],
          [AC_DEFINE(WEBP_HAVE_NEON_RTCD, [1],
                     [Set to 1 if runtime detection of NEON is enabled])])])
-
-      case "$host_os" in
-        *android*) AC_CHECK_HEADERS([cpu-features.h]) ;;
-      esac
-      ;;
-  esac
-  AC_SUBST([NEON_FLAGS])])
+        ;;
+    esac
+    AC_SUBST([NEON_FLAGS])])

 dnl === CLEAR_LIBVARS([var_pfx])
 dnl ===   Clears <var_pfx>_{INCLUDES,LIBS}.
@@ -434,44 +428,6 @@ AS_IF([test "x$enable_gl" != "xno"], [
 ])
 AM_CONDITIONAL([BUILD_VWEBP], [test "$build_vwebp" = "yes"])

-dnl === check for SDL support ===
-
-AC_ARG_ENABLE([sdl],
-              AS_HELP_STRING([--disable-sdl],
-                             [Disable detection of SDL support
-                              @<:@default=auto@:>@]))
-AS_IF([test "x$enable_sdl" != "xno"], [
-  CLEAR_LIBVARS([SDL])
-  WITHLIB_OPTION([sdl], [SDL])
-
-  $sdl_header = "no";
-  LIBCHECK_PROLOGUE([SDL])
-  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL_SDL.h"],
-                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
-                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
-  if test x"$sdl_header" != "xno" ; then
-    AC_CHECK_LIB(SDL, SDL_Init,
-                 [SDL_LIBS="-lSDL"
-                  SDL_INCLUDES="-DWEBP_HAVE_SDL"
-                  AC_DEFINE(WEBP_HAVE_SDL, [1],
-                            [Set to 1 if SDL library is installed])
-                  sdl_support=yes
-                 ],
-                 AC_MSG_WARN(Optional SDL library not found),
-                 [$MATH_LIBS]),
-    if test x"$sdl_header" == "xSDL.h" ; then
-      SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_JUST_SDL_H"
-    fi
-  fi
-  LIBCHECK_EPILOGUE([SDL])
-
-  if test "$sdl_support" = "yes" ; then
-    build_vwebp_sdl=yes
-  fi
-])
-
-AM_CONDITIONAL([BUILD_VWEBP_SDL], [test "$build_vwebp_sdl" = "yes"])
-
 dnl === check for PNG support ===

 AC_ARG_ENABLE([png], AS_HELP_STRING([--disable-png],
@@ -605,11 +561,6 @@ if test "$enable_libwebpmux" = "yes"; then
 fi
 AM_CONDITIONAL([BUILD_IMG2WEBP], [test "${build_img2webp}" = "yes"])

-if test "$enable_libwebpmux" = "yes"; then
-  build_webpinfo=yes
-fi
-AM_CONDITIONAL([BUILD_WEBPINFO], [test "${build_webpinfo}" = "yes"])
-
 dnl === check for WIC support ===

 AC_ARG_ENABLE([wic],
@@ -767,7 +718,4 @@ gif2webp    : ${build_gif2webp-no}
 img2webp    : ${build_img2webp-no}
 webpmux     : ${enable_libwebpmux-no}
 vwebp       : ${build_vwebp-no}
-webpinfo    : ${build_webpinfo-no}
-SDL support : ${sdl_support-no}
-vwebp_sdl   : ${build_vwebp_sdl-no}
 ])
--- a/examples/Android.mk
+++ b/examples/Android.mk
@@ -80,19 +80,3 @@ LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webpmux webp
 LOCAL_MODULE := img2webp_example

 include $(BUILD_EXECUTABLE)
-
-################################################################################
-# webpinfo
-
-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := \
-    webpinfo.c \
-
-LOCAL_CFLAGS := $(WEBP_CFLAGS)
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
-LOCAL_STATIC_LIBRARIES := example_util imageio_util webp
-
-LOCAL_MODULE := webpinfo_example
-
-include $(BUILD_EXECUTABLE)
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -16,9 +16,6 @@ endif
 if BUILD_VWEBP
  bin_PROGRAMS += vwebp
 endif
-if BUILD_WEBPINFO
-  bin_PROGRAMS += webpinfo
-endif

 noinst_LTLIBRARIES = libexample_util.la

@@ -69,11 +66,6 @@ img2webp_LDADD += ../imageio/libimagedec.la
 img2webp_LDADD += ../src/mux/libwebpmux.la ../src/libwebp.la
 img2webp_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

-webpinfo_SOURCES = webpinfo.c
-webpinfo_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
-webpinfo_LDADD  = libexample_util.la ../imageio/libimageio_util.la
-webpinfo_LDADD += ../src/libwebp.la
-
 if BUILD_LIBWEBPDECODER
  anim_diff_LDADD += ../src/libwebpdecoder.la
  vwebp_LDADD += ../src/libwebpdecoder.la
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -332,8 +332,9 @@ int main(int argc, const char *argv[]) {
      case BMP:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
        break;
-      case TIFF:
-        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
+      case TIFF:    // note: force pre-multiplied alpha
+        output_buffer->colorspace =
+            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
        break;
      case PGM:
      case RAW_YUV:
--- a/examples/gifdec.c
+++ b/examples/gifdec.c
@@ -28,17 +28,11 @@
 #define GIF_DISPOSE_SHIFT     2

 // from utils/utils.h
-#ifdef __cplusplus
-extern "C" {
-#endif
 extern void WebPCopyPlane(const uint8_t* src, int src_stride,
                          uint8_t* dst, int dst_stride,
                          int width, int height);
 extern void WebPCopyPixels(const WebPPicture* const src,
                           WebPPicture* const dst);
-#ifdef __cplusplus
-}
-#endif

 void GIFGetBackgroundColor(const ColorMapObject* const color_map,
                           int bgcolor_index, int transparent_index,
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@@ -378,13 +378,13 @@ static void HandleDisplay(void) {
    }
  }
  glPopMatrix();
-  glutSwapBuffers();
+  glFlush();
 }

 static void StartDisplay(void) {
  const int width = kParams.canvas_width;
  const int height = kParams.canvas_height;
-  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
+  glutInitDisplayMode(GLUT_RGBA);
  glutInitWindowSize(width, height);
  glutCreateWindow("WebP viewer");
  glutDisplayFunc(HandleDisplay);
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@@ -11,14 +11,10 @@ libwebpextras_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpextras_la_LDFLAGS = -lm
 libwebpextras_la_LIBADD = ../src/libwebp.la

-noinst_PROGRAMS =
-noinst_PROGRAMS += get_disto webp_quality
-if BUILD_VWEBP_SDL
-  noinst_PROGRAMS += vwebp_sdl
-endif
+noinst_PROGRAMS = get_disto webp_quality

-get_disto_SOURCES  = get_disto.c
-get_disto_CPPFLAGS = $(AM_CPPFLAGS)
+get_disto_SOURCES = get_disto.c
+get_disto_CPPFLAGS  = $(AM_CPPFLAGS)
 get_disto_LDADD = ../imageio/libimageio_util.la ../imageio/libimagedec.la
 get_disto_LDADD += ../src/libwebp.la
 get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)
@@ -28,9 +24,3 @@ webp_quality_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 webp_quality_LDADD  = ../imageio/libimageio_util.la
 webp_quality_LDADD += libwebpextras.la
 webp_quality_LDADD += ../src/libwebp.la
-
-vwebp_sdl_SOURCES  = vwebp_sdl.c webp_to_sdl.c webp_to_sdl.h
-vwebp_sdl_CPPFLAGS = $(AM_CPPFLAGS) $(SDL_INCLUDES)
-vwebp_sdl_LDADD = ../imageio/libimageio_util.la
-vwebp_sdl_LDADD += ../src/libwebp.la
-vwebp_sdl_LDADD += $(SDL_LIBS)
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@@ -278,7 +278,7 @@ int main(int argc, const char *argv[]) {
    goto End;
  }
  size1 = ReadPicture(name1, &pic1, 1);
-  size2 = ReadPicture(name2, &pic2, 1);
+  size2 = ReadPicture(name1, &pic2, 1);
  if (size1 == 0 || size2 == 0) goto End;

  if (!keep_alpha) {
--- a/extras/vwebp_sdl.c
+++ b/extras/vwebp_sdl.c
@@ -1,96 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Simple SDL-based WebP file viewer.
-// Does not support animation, just static images.
-//
-// Press 'q' to exit.
-//
-// Author: James Zern (jzern@google.com)
-
-#include <stdio.h>
-
-#ifdef HAVE_CONFIG_H
-#include "webp/config.h"
-#endif
-
-#if defined(WEBP_HAVE_SDL)
-
-#include "webp_to_sdl.h"
-#include "webp/decode.h"
-#include "../imageio/imageio_util.h"
-
-#if defined(WEBP_HAVE_JUST_SDL_H)
-#include <SDL.h>
-#else
-#include <SDL/SDL.h>
-#endif
-
-static void ProcessEvents(void) {
-  int done = 0;
-  SDL_Event event;
-  while (!done && SDL_WaitEvent(&event)) {
-    switch (event.type) {
-      case SDL_KEYUP:
-        switch (event.key.keysym.sym) {
-          case SDLK_q: done = 1; break;
-          default: break;
-        }
-        break;
-      default: break;
-    }
-  }
-}
-
-int main(int argc, char* argv[]) {
-  int c;
-  int ok = 0;
-  for (c = 1; c < argc; ++c) {
-    const char* file = NULL;
-    const uint8_t* webp = NULL;
-    size_t webp_size = 0;
-    if (!strcmp(argv[c], "-h")) {
-      printf("Usage: %s [-h] image.webp [more_files.webp...]\n", argv[0]);
-      return 0;
-    } else {
-      file = argv[c];
-    }
-    if (file == NULL) continue;
-    if (!ImgIoUtilReadFile(file, &webp, &webp_size)) {
-      fprintf(stderr, "Error opening file: %s\n", file);
-      goto Error;
-    }
-    if (webp_size != (size_t)(int)webp_size) {
-      fprintf(stderr, "File too large.\n");
-      goto Error;
-    }
-    ok = WebpToSDL((const char*)webp, (int)webp_size);
-    free((void*)webp);
-    if (!ok) {
-      fprintf(stderr, "Error decoding file %s\n", file);
-      goto Error;
-    }
-    ProcessEvents();
-  }
-  ok = 1;
-
- Error:
-  SDL_Quit();
-  return ok ? 0 : 1;
-}
-
-#else  // !WEBP_HAVE_SDL
-
-int main(int argc, const char *argv[]) {
-  fprintf(stderr, "SDL support not enabled in %s.\n", argv[0]);
-  (void)argc;
-  return 0;
-}
-
-#endif
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@@ -1,105 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Simple WebP-to-SDL wrapper. Useful for emscripten.
-//
-// Author: James Zern (jzern@google.com)
-
-#ifdef HAVE_CONFIG_H
-#include "webp/config.h"
-#endif
-
-#if defined(WEBP_HAVE_SDL)
-
-#include "webp_to_sdl.h"
-
-#include <stdio.h>
-#include "webp/decode.h"
-
-#if defined(WEBP_HAVE_JUST_SDL_H)
-#include <SDL.h>
-#else
-#include <SDL/SDL.h>
-#endif
-
-int WebpToSDL(const char* data, unsigned int data_size) {
-  int ok = 0;
-  VP8StatusCode status;
-  WebPDecoderConfig config;
-  WebPBitstreamFeatures* const input = &config.input;
-  WebPDecBuffer* const output = &config.output;
-  SDL_Surface* screen = NULL;
-  SDL_Surface* surface = NULL;
-
-  if (!WebPInitDecoderConfig(&config)) {
-    fprintf(stderr, "Library version mismatch!\n");
-    return 1;
-  }
-
-  SDL_Init(SDL_INIT_VIDEO);
-
-  status = WebPGetFeatures((uint8_t*)data, (size_t)data_size, &config.input);
-  if (status != VP8_STATUS_OK) goto Error;
-
-  screen = SDL_SetVideoMode(input->width, input->height, 32, SDL_SWSURFACE);
-  if (screen == NULL) {
-    fprintf(stderr, "Unable to set video mode (32bpp %dx%d)!\n",
-            input->width, input->height);
-    goto Error;
-  }
-
-  surface = SDL_CreateRGBSurface(SDL_SWSURFACE,
-                                 input->width, input->height, 32,
-                                 0x000000ffu,   // R mask
-                                 0x0000ff00u,   // G mask
-                                 0x00ff0000u,   // B mask
-                                 0xff000000u);  // A mask
-
-  if (surface == NULL) {
-    fprintf(stderr, "Unable to create %dx%d RGBA surface!\n",
-            input->width, input->height);
-    goto Error;
-  }
-  if (SDL_MUSTLOCK(surface)) SDL_LockSurface(surface);
-
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-  output->colorspace = MODE_BGRA;
-#else
-  output->colorspace = MODE_RGBA;
-#endif
-  output->width  = surface->w;
-  output->height = surface->h;
-  output->u.RGBA.rgba   = surface->pixels;
-  output->u.RGBA.stride = surface->pitch;
-  output->u.RGBA.size   = surface->pitch * surface->h;
-  output->is_external_memory = 1;
-
-  status = WebPDecode((const uint8_t*)data, (size_t)data_size, &config);
-  if (status != VP8_STATUS_OK) {
-    fprintf(stderr, "Error decoding image (%d)\n", status);
-    goto Error;
-  }
-
-  if (SDL_MUSTLOCK(surface)) SDL_UnlockSurface(surface);
-  if (SDL_BlitSurface(surface, NULL, screen, NULL) ||
-      SDL_Flip(screen)) {
-    goto Error;
-  }
-
-  ok = 1;
-
- Error:
-  SDL_FreeSurface(surface);
-  SDL_FreeSurface(screen);
-  return ok;
-}
-
-//------------------------------------------------------------------------------
-
-#endif  // WEBP_HAVE_SDL
--- a/extras/webp_to_sdl.h
+++ b/extras/webp_to_sdl.h
@@ -1,22 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Simple WebP-to-SDL wrapper. Useful for emscripten.
-//
-// Author: James Zern (jzern@google.com)
-
-#ifndef WEBP_EXTRAS_WEBP_TO_SDL_H_
-#define WEBP_EXTRAS_WEBP_TO_SDL_H_
-
-// Exports the method WebpToSDL(const char* data, int data_size) which decodes
-// a WebP bitstream into an RGBA SDL surface.
-// Return false on failure.
-extern int WebpToSDL(const char* data, unsigned int data_size);
-
-#endif  // WEBP_EXTRAS_WEBP_TO_SDL_H_
--- a/imageio/Android.mk
+++ b/imageio/Android.mk
@@ -25,7 +25,6 @@ LOCAL_SRC_FILES := \
    jpegdec.c \
    metadata.c \
    pngdec.c \
-    pnmdec.c \
    tiffdec.c \
    webpdec.c \

--- a/imageio/Makefile.am
+++ b/imageio/Makefile.am
@@ -11,7 +11,6 @@ libimagedec_la_SOURCES  = image_dec.c image_dec.h
 libimagedec_la_SOURCES += jpegdec.c jpegdec.h
 libimagedec_la_SOURCES += metadata.c metadata.h
 libimagedec_la_SOURCES += pngdec.c pngdec.h
-libimagedec_la_SOURCES += pnmdec.c pnmdec.h
 libimagedec_la_SOURCES += tiffdec.c tiffdec.h
 libimagedec_la_SOURCES += webpdec.c webpdec.h
 libimagedec_la_SOURCES += wicdec.c wicdec.h
--- a/imageio/image_dec.c
+++ b/imageio/image_dec.c
@@ -29,10 +29,6 @@ WebPInputFileFormat WebPGuessImageType(const uint8_t* const data,
      format = WEBP_TIFF_FORMAT;
    } else if (magic1 == 0x52494646 && magic2 == 0x57454250) {
      format = WEBP_WEBP_FORMAT;
-    } else if (((magic1 >> 24) & 0xff) == 'P') {
-      const int type = (magic1 >> 16) & 0xff;
-      // we only support 'P5 -> P7' for now.
-      if (type >= '5' && type <= '7') format = WEBP_PNM_FORMAT;
    }
  }
  return format;
@@ -55,7 +51,6 @@ WebPImageReader WebPGetImageReader(WebPInputFileFormat format) {
    case WEBP_JPEG_FORMAT: return ReadJPEG;
    case WEBP_TIFF_FORMAT: return ReadTIFF;
    case WEBP_WEBP_FORMAT: return ReadWebP;
-    case WEBP_PNM_FORMAT: return ReadPNM;
    default: return FailReader;
  }
 }
--- a/imageio/image_dec.h
+++ b/imageio/image_dec.h
@@ -23,7 +23,6 @@
 #include "./metadata.h"
 #include "./jpegdec.h"
 #include "./pngdec.h"
-#include "./pnmdec.h"
 #include "./tiffdec.h"
 #include "./webpdec.h"
 #include "./wicdec.h"
@@ -37,7 +36,6 @@ typedef enum {
  WEBP_JPEG_FORMAT,
  WEBP_TIFF_FORMAT,
  WEBP_WEBP_FORMAT,
-  WEBP_PNM_FORMAT,
  WEBP_UNSUPPORTED_FORMAT
 } WebPInputFileFormat;

--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@@ -361,8 +361,6 @@ int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint8_t* rgba = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
-  const uint8_t assoc_alpha =
-      WebPIsPremultipliedMode(buffer->colorspace) ? 1 : 2;
  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
                                            : NUM_IFD_ENTRIES - 1;
@@ -390,8 +388,7 @@ int WebPWriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
-    0x52, 0x01, 3, 0, 1, 0, 0, 0,
-        assoc_alpha, 0, 0, 0,                    // 178: ExtraSamples: rgbA/RGBA
+    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
    0, 0, 0, 0,                                  // 190: IFD terminator
    // EXTRA_DATA_OFFSET:
    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@@ -112,7 +112,7 @@ int ImgIoUtilWriteFile(const char* const file_name,
  if (data == NULL) {
    return 0;
  }
-  out = to_stdout ? ImgIoUtilSetBinaryMode(stdout) : fopen(file_name, "wb");
+  out = to_stdout ? stdout : fopen(file_name, "wb");
  if (out == NULL) {
    fprintf(stderr, "Error! Cannot open output file '%s'\n", file_name);
    return 0;
--- a/imageio/pnmdec.c
+++ b/imageio/pnmdec.c
@@ -1,252 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// (limited) PNM decoder
-
-#include "./pnmdec.h"
-
-#include <assert.h>
-#include <ctype.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "webp/encode.h"
-#include "./imageio_util.h"
-
-typedef enum {
-  WIDTH_FLAG      = 1 << 0,
-  HEIGHT_FLAG     = 1 << 1,
-  DEPTH_FLAG      = 1 << 2,
-  MAXVAL_FLAG     = 1 << 3,
-  TUPLE_FLAG      = 1 << 4,
-  ALL_NEEDED_FLAGS = 0x1f
-} PNMFlags;
-
-typedef struct {
-  const uint8_t* data;
-  size_t data_size;
-  int width, height;
-  int bytes_per_px;   // 1, 3, 4
-  int depth;
-  int max_value;
-  int type;           // 5, 6 or 7
-  int seen_flags;
-} PNMInfo;
-
-// -----------------------------------------------------------------------------
-// PNM decoding
-
-#define MAX_LINE_SIZE 1024
-static const size_t kMinPNMHeaderSize = 3;
-
-static size_t ReadLine(const uint8_t* const data, size_t off, size_t data_size,
-                       char out[MAX_LINE_SIZE + 1], size_t* const out_size) {
-  size_t i = 0;
-  *out_size = 0;
- redo:
-  for (i = 0; i < MAX_LINE_SIZE && off < data_size; ++i) {
-    out[i] = data[off++];
-    if (out[i] == '\n') break;
-  }
-  if (off < data_size) {
-    if (i == 0) goto redo;         // empty line
-    if (out[0] == '#') goto redo;  // skip comment
-  }
-  out[i] = 0;   // safety sentinel
-  *out_size = i;
-  return off;
-}
-
-static size_t FlagError(const char flag[]) {
-  fprintf(stderr, "PAM header error: flags '%s' already seen.\n", flag);
-  return 0;
-}
-
-// inspired from http://netpbm.sourceforge.net/doc/pam.html
-static size_t ReadPAMFields(PNMInfo* const info, size_t off) {
-  char out[MAX_LINE_SIZE + 1];
-  size_t out_size;
-  int tmp;
-  assert(info != NULL);
-  while (1) {
-    off = ReadLine(info->data, off, info->data_size, out, &out_size);
-    if (off == 0) return 0;
-    if (sscanf(out, "WIDTH %d", &tmp) == 1) {
-      if (info->seen_flags & WIDTH_FLAG) return FlagError("WIDTH");
-      info->seen_flags |= WIDTH_FLAG;
-      info->width = tmp;
-    } else if (sscanf(out, "HEIGHT %d", &tmp) == 1) {
-      if (info->seen_flags & HEIGHT_FLAG) return FlagError("HEIGHT");
-      info->seen_flags |= HEIGHT_FLAG;
-      info->height = tmp;
-    } else if (sscanf(out, "DEPTH %d", &tmp) == 1) {
-      if (info->seen_flags & DEPTH_FLAG) return FlagError("DEPTH");
-      info->seen_flags |= DEPTH_FLAG;
-      info->depth = tmp;
-    } else if (sscanf(out, "MAXVAL %d", &tmp) == 1) {
-      if (info->seen_flags & MAXVAL_FLAG) return FlagError("MAXVAL");
-      info->seen_flags |= MAXVAL_FLAG;
-      info->max_value = tmp;
-    } else if (!strcmp(out, "TUPLTYPE RGB_ALPHA")) {
-      info->bytes_per_px = 4;
-      info->seen_flags |= TUPLE_FLAG;
-    } else if (!strcmp(out, "TUPLTYPE RGB")) {
-      info->bytes_per_px = 3;
-      info->seen_flags |= TUPLE_FLAG;
-    } else if (!strcmp(out, "TUPLTYPE GRAYSCALE")) {
-      info->bytes_per_px = 1;
-      info->seen_flags |= TUPLE_FLAG;
-    } else if (!strcmp(out, "ENDHDR")) {
-      break;
-    } else {
-      static const char kEllipsis[] = " ...";
-      int i;
-      if (out_size > 20) sprintf(out + 20 - strlen(kEllipsis), kEllipsis);
-      for (i = 0; i < (int)strlen(out); ++i) {
-        if (!isprint(out[i])) out[i] = ' ';
-      }
-      fprintf(stderr, "PAM header error: unrecognized entry [%s]\n", out);
-      return 0;
-    }
-  }
-  if (!(info->seen_flags & TUPLE_FLAG)) {
-    info->seen_flags |= TUPLE_FLAG;
-    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
-  }
-  if (info->seen_flags != ALL_NEEDED_FLAGS) {
-    fprintf(stderr, "PAM: incomplete header.\n");
-    return 0;
-  }
-  return off;
-}
-
-static size_t ReadHeader(PNMInfo* const info) {
-  size_t off = 0;
-  char out[MAX_LINE_SIZE + 1];
-  size_t out_size;
-  if (info == NULL) return 0;
-  if (info->data == NULL || info->data_size < kMinPNMHeaderSize) return 0;
-
-  info->width = info->height = 0;
-  info->type = -1;
-  info->seen_flags = 0;
-  info->bytes_per_px = 0;
-  info->depth = 0;
-  info->max_value = 0;
-
-  off = ReadLine(info->data, off, info->data_size, out, &out_size);
-  if (off == 0 || sscanf(out, "P%d", &info->type) != 1) return 0;
-  if (info->type == 7) {
-    off = ReadPAMFields(info, off);
-  } else {
-    off = ReadLine(info->data, off, info->data_size, out, &out_size);
-    if (off == 0 || sscanf(out, "%d %d", &info->width, &info->height) != 2) {
-      return 0;
-    }
-    off = ReadLine(info->data, off, info->data_size, out, &out_size);
-    if (off == 0 || sscanf(out, "%d", &info->max_value) != 1) return 0;
-
-    // finish initializing missing fields
-    info->depth = (info->type == 5) ? 1 : 3;
-    info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
-  }
-  // perform some basic numerical validation
-  if (info->width <= 0 || info->height <= 0 ||
-      info->type <= 0 || info->type >= 9 ||
-      info->depth <= 0 || info->depth > 4 ||
-      info->bytes_per_px < info->depth ||
-      info->max_value <= 0 || info->max_value >= 65536) {
-    return 0;
-  }
-  return off;
-}
-
-int ReadPNM(const uint8_t* const data, size_t data_size,
-            WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata) {
-  int ok = 0;
-  int i, j;
-  uint64_t stride, pixel_bytes;
-  uint8_t* rgb = NULL, *tmp_rgb;
-  size_t offset;
-  PNMInfo info;
-
-  info.data = data;
-  info.data_size = data_size;
-  offset = ReadHeader(&info);
-  if (offset == 0) {
-    fprintf(stderr, "Error parsing PNM header.\n");
-    goto End;
-  }
-
-  if (info.type < 5 || info.type > 7) {
-    fprintf(stderr, "Unsupported P%d PNM format.\n", info.type);
-    goto End;
-  }
-
-  // Some basic validations.
-  if (pic == NULL) goto End;
-  if (info.width > WEBP_MAX_DIMENSION || info.height > WEBP_MAX_DIMENSION) {
-    fprintf(stderr, "Invalid %dx%d dimension for PNM\n",
-                    info.width, info.height);
-    goto End;
-  }
-
-  pixel_bytes = (uint64_t)info.width * info.height * info.bytes_per_px;
-  if (data_size < offset + pixel_bytes) {
-    fprintf(stderr, "Truncated PNM file (P%d).\n", info.type);
-    goto End;
-  }
-  stride =
-      (uint64_t)(info.bytes_per_px < 3 ? 3 : info.bytes_per_px) * info.width;
-  if (stride != (size_t)stride ||
-      !ImgIoUtilCheckSizeArgumentsOverflow(stride, info.height)) {
-    goto End;
-  }
-
-  rgb = (uint8_t*)malloc((size_t)stride * info.height);
-  if (rgb == NULL) goto End;
-
-  // Convert input
-  tmp_rgb = rgb;
-  for (j = 0; j < info.height; ++j) {
-    assert(offset + info.bytes_per_px * info.width <= data_size);
-    if (info.depth == 1) {
-      // convert grayscale -> RGB
-      for (i = 0; i < info.width; ++i) {
-        const uint8_t v = data[offset + i];
-        tmp_rgb[3 * i + 0] = tmp_rgb[3 * i + 1] = tmp_rgb[3 * i + 2] = v;
-      }
-    } else if (info.depth == 3) {   // RGB
-      memcpy(tmp_rgb, data + offset, 3 * info.width * sizeof(*data));
-    } else if (info.depth == 4) {   // RGBA
-      memcpy(tmp_rgb, data + offset, 4 * info.width * sizeof(*data));
-    }
-    offset += info.bytes_per_px * info.width;
-    tmp_rgb += stride;
-  }
-
-  // WebP conversion.
-  pic->width = info.width;
-  pic->height = info.height;
-  ok = (info.depth == 4) ? WebPPictureImportRGBA(pic, rgb, (int)stride)
-                         : WebPPictureImportRGB(pic, rgb, (int)stride);
-  if (!ok) goto End;
-
-  ok = 1;
- End:
-  free((void*)rgb);
-
-  (void)metadata;
-  (void)keep_alpha;
-  return ok;
-}
-
-// -----------------------------------------------------------------------------
--- a/imageio/pnmdec.h
+++ b/imageio/pnmdec.h
@@ -1,37 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// partial PNM format decoder (ppm/pgm)
-
-#ifndef WEBP_IMAGEIO_PNMDEC_H_
-#define WEBP_IMAGEIO_PNMDEC_H_
-
-#include "webp/types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct Metadata;
-struct WebPPicture;
-
-// Reads a PNM file from 'data', returning the decoded output in 'pic'.
-// The output is RGB or YUV depending on pic->use_argb value.
-// Returns true on success.
-// 'metadata' has no effect, but is kept for coherence with other signatures
-// for image readers.
-int ReadPNM(const uint8_t* const data, size_t data_size,
-            struct WebPPicture* const pic, int keep_alpha,
-            struct Metadata* const metadata);
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  // WEBP_IMAGEIO_PNMDEC_H_
--- a/imageio/tiffdec.c
+++ b/imageio/tiffdec.c
@@ -15,7 +15,6 @@
 #include "webp/config.h"
 #endif

-#include <limits.h>
 #include <stdio.h>
 #include <string.h>

@@ -108,7 +107,7 @@ static void MyUnmapFile(thandle_t opaque, void* base, toff_t size) {
 static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
  MyData* const my_data = (MyData*)opaque;
  if (my_data->pos + size > my_data->size) {
-    size = (tsize_t)(my_data->size - my_data->pos);
+    size = my_data->size - my_data->pos;
  }
  if (size > 0) {
    memcpy(dst, my_data->data + my_data->pos, size);
@@ -117,55 +116,18 @@ static tsize_t MyRead(thandle_t opaque, void* dst, tsize_t size) {
  return size;
 }

-// Unmultiply Argb data. Taken from dsp/alpha_processing
-// (we don't want to force a dependency to a libdspdec library).
-#define MFIX 24    // 24bit fixed-point arithmetic
-#define HALF ((1u << MFIX) >> 1)
-#define KINV_255 ((1u << MFIX) / 255u)
-
-static uint32_t Unmult(uint8_t x, uint32_t mult) {
-  const uint32_t v = (x * mult + HALF) >> MFIX;
-  return (v > 255u) ? 255u : v;
-}
-
-static WEBP_INLINE uint32_t GetScale(uint32_t a) {
-  return (255u << MFIX) / a;
-}
-
-static void MultARGBRow(uint8_t* ptr, int width) {
-  int x;
-  for (x = 0; x < width; ++x, ptr += 4) {
-    const uint32_t alpha = ptr[3];
-    if (alpha < 255) {
-      if (alpha == 0) {   // alpha == 0
-        ptr[0] = ptr[1] = ptr[2] = 0;
-      } else {
-        const uint32_t scale = GetScale(alpha);
-        ptr[0] = Unmult(ptr[0], scale);
-        ptr[1] = Unmult(ptr[1], scale);
-        ptr[2] = Unmult(ptr[2], scale);
-      }
-    }
-  }
-}
-
 int ReadTIFF(const uint8_t* const data, size_t data_size,
             WebPPicture* const pic, int keep_alpha,
             Metadata* const metadata) {
  MyData my_data = { data, (toff_t)data_size, 0 };
  TIFF* tif;
-  uint32_t width, height;
-  uint16_t samples_per_px = 0;
-  uint16_t extra_samples = 0;
-  uint16_t* extra_samples_ptr = NULL;
-  uint32_t* raster;
+  uint32 width, height;
+  uint32* raster;
  int64_t alloc_size;
  int ok = 0;
  tdir_t dircount;

-  if (data == NULL || data_size == 0 || data_size > INT_MAX || pic == NULL) {
-    return 0;
-  }
+  if (data == NULL || data_size == 0 || pic == NULL) return 0;

  tif = TIFFClientOpen("Memory", "r", &my_data,
                       MyRead, MyRead, MySeek, MyClose,
@@ -181,27 +143,17 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
                    "Only the first will be used, %d will be ignored.\n",
                    dircount - 1);
  }
-  if (!TIFFGetFieldDefaulted(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_px)) {
-    fprintf(stderr, "Error! Cannot retrieve TIFF samples-per-pixel info.\n");
-    goto End;
-  }
-  if (samples_per_px < 3 || samples_per_px > 4) goto End;  // not supported

  if (!(TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width) &&
        TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height))) {
    fprintf(stderr, "Error! Cannot retrieve TIFF image dimensions.\n");
    goto End;
  }
+
  if (!ImgIoUtilCheckSizeArgumentsOverflow((uint64_t)width * height,
                                           sizeof(*raster))) {
    goto End;
  }
-  if (samples_per_px > 3 && !TIFFGetField(tif, TIFFTAG_EXTRASAMPLES,
-                                          &extra_samples, &extra_samples_ptr)) {
-    fprintf(stderr, "Error! Cannot retrieve TIFF ExtraSamples info.\n");
-    goto End;
-  }
-
  // _Tiffmalloc uses a signed type for size.
  alloc_size = (int64_t)((uint64_t)width * height * sizeof(*raster));
  if (alloc_size < 0 || alloc_size != (tsize_t)alloc_size) goto End;
@@ -217,16 +169,6 @@ int ReadTIFF(const uint8_t* const data, size_t data_size,
 #ifdef WORDS_BIGENDIAN
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
-      // if we have an alpha channel, we must un-multiply from rgbA to RGBA
-      if (extra_samples == 1 && extra_samples_ptr != NULL &&
-          extra_samples_ptr[0] == EXTRASAMPLE_ASSOCALPHA) {
-        uint32_t y;
-        uint8_t* tmp = (uint8_t*)raster;
-        for (y = 0; y < height; ++y) {
-          MultARGBRow(tmp, width);
-          tmp += stride;
-        }
-      }
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
--- a/imageio/webpdec.c
+++ b/imageio/webpdec.c
@@ -138,53 +138,46 @@ int ReadWebP(const uint8_t* const data, size_t data_size,
    PrintWebPError("input data", status);
    return 0;
  }
-
-  do {
+  {
    const int has_alpha = keep_alpha && bitstream->has_alpha;
-    pic->width = bitstream->width;
-    pic->height = bitstream->height;
-    if (!pic->use_argb) pic->colorspace = has_alpha ? WEBP_YUV420A
-                                                    : WEBP_YUV420;
-    ok = WebPPictureAlloc(pic);
-    if (!ok) {
-      status = VP8_STATUS_OUT_OF_MEMORY;
-      break;
-    }
    if (pic->use_argb) {
-      output_buffer->colorspace = MODE_BGRA;
-      output_buffer->u.RGBA.rgba = (uint8_t*)pic->argb;
-      output_buffer->u.RGBA.stride = pic->argb_stride * sizeof(uint32_t);
-      output_buffer->u.RGBA.size = output_buffer->u.RGBA.stride * pic->height;
+      output_buffer->colorspace = has_alpha ? MODE_RGBA : MODE_RGB;
    } else {
      output_buffer->colorspace = has_alpha ? MODE_YUVA : MODE_YUV;
-      output_buffer->u.YUVA.y = pic->y;
-      output_buffer->u.YUVA.u = pic->u;
-      output_buffer->u.YUVA.v = pic->v;
-      output_buffer->u.YUVA.a = has_alpha ? pic->a : NULL;
-      output_buffer->u.YUVA.y_stride = pic->y_stride;
-      output_buffer->u.YUVA.u_stride = pic->uv_stride;
-      output_buffer->u.YUVA.v_stride = pic->uv_stride;
-      output_buffer->u.YUVA.a_stride = has_alpha ? pic->a_stride : 0;
-      output_buffer->u.YUVA.y_size = pic->height * pic->y_stride;
-      output_buffer->u.YUVA.u_size = (pic->height + 1) / 2 * pic->uv_stride;
-      output_buffer->u.YUVA.v_size = (pic->height + 1) / 2 * pic->uv_stride;
-      output_buffer->u.YUVA.a_size = pic->height * pic->a_stride;
    }
-    output_buffer->is_external_memory = 1;

    status = DecodeWebP(data, data_size, &config);
-    ok = (status == VP8_STATUS_OK);
-    if (!ok) WebPPictureFree(pic);
-    if (ok && !keep_alpha && pic->use_argb) {
-      // Need to wipe out the alpha value, as requested.
-      int x, y;
-      uint32_t* argb = pic->argb;
-      for (y = 0; y < pic->height; ++y) {
-        for (x = 0; x < pic->width; ++x) argb[x] |= 0xff000000u;
-        argb += pic->argb_stride;
+    if (status == VP8_STATUS_OK) {
+      pic->width = output_buffer->width;
+      pic->height = output_buffer->height;
+      if (pic->use_argb) {
+        const uint8_t* const rgba = output_buffer->u.RGBA.rgba;
+        const int stride = output_buffer->u.RGBA.stride;
+        ok = has_alpha ? WebPPictureImportRGBA(pic, rgba, stride)
+                       : WebPPictureImportRGB(pic, rgba, stride);
+      } else {
+        pic->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+        ok = WebPPictureAlloc(pic);
+        if (!ok) {
+          status = VP8_STATUS_OUT_OF_MEMORY;
+        } else {
+          const WebPYUVABuffer* const yuva = &output_buffer->u.YUVA;
+          const int uv_width = (pic->width + 1) >> 1;
+          const int uv_height = (pic->height + 1) >> 1;
+          ImgIoUtilCopyPlane(yuva->y, yuva->y_stride,
+                             pic->y, pic->y_stride, pic->width, pic->height);
+          ImgIoUtilCopyPlane(yuva->u, yuva->u_stride,
+                             pic->u, pic->uv_stride, uv_width, uv_height);
+          ImgIoUtilCopyPlane(yuva->v, yuva->v_stride,
+                             pic->v, pic->uv_stride, uv_width, uv_height);
+          if (has_alpha) {
+            ImgIoUtilCopyPlane(yuva->a, yuva->a_stride,
+                               pic->a, pic->a_stride, pic->width, pic->height);
+          }
+        }
      }
    }
-  } while (0);   // <- so we can 'break' out of the loop
+  }

  if (status != VP8_STATUS_OK) {
    PrintWebPError("input data", status);
--- a/imageio/webpdec.h
+++ b/imageio/webpdec.h
@@ -51,7 +51,7 @@ VP8StatusCode DecodeWebPIncremental(

 //------------------------------------------------------------------------------

-// Decodes a WebP contained in 'data', returning the decoded output in 'pic'.
+// Reads a WebP from 'in_file', returning the decoded output in 'pic'.
 // Output is RGBA or YUVA, depending on pic->use_argb value.
 // If 'keep_alpha' is true and the WebP has an alpha channel, the output is RGBA
 // or YUVA. Otherwise, alpha channel is dropped and output is RGB or YUV.
--- a/makefile.unix
+++ b/makefile.unix
@@ -29,8 +29,6 @@ ifeq ($(strip $(shell uname)), Darwin)
  EXTRA_LIBS  += -L/opt/local/lib
  GL_LIBS = -framework GLUT -framework OpenGL
 else
-  EXTRA_FLAGS += -I/usr/local/include
-  EXTRA_LIBS  += -L/usr/local/lib
  GL_LIBS = -lglut -lGL
 endif

@@ -169,7 +167,6 @@ DSP_DEC_OBJS = \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
    src/dsp/yuv_mips_dsp_r2.o \
-    src/dsp/yuv_neon.o \
    src/dsp/yuv_sse2.o \

 DSP_ENC_OBJS = \
@@ -195,13 +192,10 @@ DSP_ENC_OBJS = \
    src/dsp/lossless_enc_neon.o \
    src/dsp/lossless_enc_sse2.o \
    src/dsp/lossless_enc_sse41.o \
-    src/dsp/ssim.o \
-    src/dsp/ssim_sse2.o \

 ENC_OBJS = \
    src/enc/alpha_enc.o \
    src/enc/analysis_enc.o \
-    src/enc/backward_references_cost_enc.o \
    src/enc/backward_references_enc.o \
    src/enc/config_enc.o \
    src/enc/cost_enc.o \
@@ -229,7 +223,6 @@ EX_FORMAT_DEC_OBJS = \
    imageio/jpegdec.o \
    imageio/metadata.o \
    imageio/pngdec.o \
-    imageio/pnmdec.o \
    imageio/tiffdec.o \
    imageio/webpdec.o \

@@ -335,8 +328,8 @@ OUT_LIBS += src/libwebp.a
 EXTRA_LIB = extras/libwebpextras.a
 OUT_EXAMPLES = examples/cwebp examples/dwebp
 EXTRA_EXAMPLES = examples/gif2webp examples/vwebp examples/webpmux \
-                 examples/anim_diff examples/img2webp examples/webpinfo
-OTHER_EXAMPLES = extras/get_disto extras/webp_quality extras/vwebp_sdl
+                 examples/anim_diff examples/img2webp
+OTHER_EXAMPLES = extras/get_disto extras/webp_quality

 OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
@@ -387,7 +380,6 @@ examples/gif2webp: examples/gif2webp.o $(GIFDEC_OBJS)
 examples/vwebp: examples/vwebp.o
 examples/webpmux: examples/webpmux.o
 examples/img2webp: examples/img2webp.o
-examples/webpinfo: examples/webpinfo.o

 examples/anim_diff: examples/libanim_util.a examples/libgifdec.a
 examples/anim_diff: src/demux/libwebpdemux.a examples/libexample_util.a
@@ -419,8 +411,6 @@ examples/img2webp: examples/libexample_util.a imageio/libimageio_util.a
 examples/img2webp: imageio/libimagedec.a
 examples/img2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/img2webp: EXTRA_LIBS += $(CWEBP_LIBS)
-examples/webpinfo: examples/libexample_util.a imageio/libimageio_util.a
-examples/webpinfo: src/libwebpdecoder.a

 extras/get_disto: extras/get_disto.o
 extras/get_disto: imageio/libimagedec.a imageio/libimageio_util.a src/libwebp.a
@@ -430,13 +420,6 @@ extras/webp_quality: extras/webp_quality.o
 extras/webp_quality: imageio/libimageio_util.a
 extras/webp_quality: $(EXTRA_LIB) src/libwebp.a

-extras/vwebp_sdl: extras/vwebp_sdl.o
-extras/vwebp_sdl: extras/webp_to_sdl.o
-extras/vwebp_sdl: imageio/libimageio_util.a
-extras/vwebp_sdl: src/libwebp.a
-extras/vwebp_sdl: EXTRA_FLAGS += -DWEBP_HAVE_SDL
-extras/vwebp_sdl: EXTRA_LIBS += -lSDL
-
 $(OUT_EXAMPLES) $(EXTRA_EXAMPLES) $(OTHER_EXAMPLES):
 	$(CC) -o $@ $^ $(LDFLAGS)

@@ -452,7 +435,7 @@ dist: all
 	$(INSTALL) -m644 src/mux/libwebpmux.a $(DESTDIR)/lib
 	umask 022; \
 	for m in man/[cdv]webp.1 man/gif2webp.1 man/webpmux.1 \
-                 man/img2webp.1 man/webpinfo.1; do \
+                 man/img2webp.1; do \
 	  basenam=$$(basename $$m .1); \
 	  $(GROFF) -t -e -man -T utf8 $$m \
 	    | $(COL) -bx >$(DESTDIR)/doc/$${basenam}.txt; \
--- a/man/Makefile.am
+++ b/man/Makefile.am
@@ -8,7 +8,4 @@ endif
 if BUILD_VWEBP
  man_MANS += vwebp.1
 endif
-if BUILD_WEBPINFO
-  man_MANS += webpinfo.1
-endif
 EXTRA_DIST = $(man_MANS)
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -98,7 +98,8 @@ Crop the source to a rectangle with top\-left corner at coordinates
 This cropping area must be fully contained within the source rectangle.
 .TP
 .B \-mt
-Use multi\-threading for encoding, if possible.
+Use multi\-threading for encoding, if possible. This option is only effective
+when using lossy compression on a source with a transparency channel.
 .TP
 .B \-low_memory
 Reduce memory usage of lossy encoding by saving four times the compressed
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@@ -108,7 +108,8 @@ the value the smoother the picture will appear. Typical values are usually in
 the range of 20 to 50.
 .TP
 .B \-mt
-Use multi-threading for encoding, if possible.
+Use multi-threading for encoding, if possible. This option is only effective
+when using lossy compression.
 .TP
 .B \-v
 Print extra information.
--- a/man/webpinfo.1
+++ b/man/webpinfo.1
@@ -1,77 +0,0 @@
-.\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPINFO 1 "May 08, 2017"
-.SH NAME
-webpinfo \- print out the chunk level structure of WebP files
-along with basic integrity checks.
-.SH SYNOPSIS
-.B webpinfo
-.I OPTIONS
-.I INPUT
-.br
-.B webpinfo [\-h|\-help|\-H|\-longhelp]
-.br
-
-.SH DESCRIPTION
-This manual page documents the
-.B webpinfo
-command.
-.PP
-\fBwebpinfo\fP can be used to print out the chunk level structure and bitstream
-header information of WebP files. It can also check if the files are of valid
-WebP format.
-
-.SH OPTIONS
-.TP
-.B -quiet
-Do not show chunk parsing information.
-.TP
-.B -diag
-Show parsing error diagnosis.
-.TP
-.B -summary
-Show chunk stats summary.
-.TP
-.BI -bitstream_info
-Parse bitstream header.
-.TP
-.B \-h, \-help
-A short usage summary.
-.TP
-.B \-H, \-longhelp
-Detailed usage instructions.
-
-.SH INPUT
-Input files in WebP format. Input files must come last, following
-options (if any). There can be multiple input files.
-
-.SH BUGS
-Please report all bugs to the issue tracker:
-https://bugs.chromium.org/p/webp
-.br
-Patches welcome! See this page to get started:
-http://www.webmproject.org/code/contribute/submitting\-patches/
-
-.SH EXAMPLES
-.br
-webpinfo \-h
-.br
-webpinfo \-diag \-summary input_file.webp
-.br
-webpinfo \-bitstream_info input_file_1.webp input_file_2.webp
-.br
-webpinfo *.webp
-
-.SH AUTHORS
-\fBwebpinfo\fP is a part of libwebp and was written by the WebP team.
-.br
-The latest source tree is available at
-https://chromium.googlesource.com/webm/libwebp
-.PP
-This manual page was written by Hui Su <huisu@google.com>,
-for the Debian project (and may be used by others).
-
-.SH SEE ALSO
-.BR webpmux (1)
-.br
-Please refer to http://developers.google.com/speed/webp/ for additional
-information.
--- a/src/dec/vp8_dec.h
+++ b/src/dec/vp8_dec.h
@@ -33,7 +33,7 @@ extern "C" {
 //   /* customize io's functions (setup()/put()/teardown()) if needed. */
 //
 //   VP8Decoder* dec = VP8New();
-//   int ok = VP8Decode(dec, &io);
+//   bool ok = VP8Decode(dec);
 //   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
 //   VP8Delete(dec);
 //   return ok;
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@@ -1012,13 +1012,12 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
      ok = 0;
      goto End;
    }
-    br->eos_ = VP8LIsEndOfStream(br);
+    assert(br->eos_ == VP8LIsEndOfStream(br));
  }
  // Process the remaining rows corresponding to last row-block.
  ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);

 End:
-  br->eos_ = VP8LIsEndOfStream(br);
  if (!ok || (br->eos_ && pos < end)) {
    ok = 0;
    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@@ -1091,12 +1090,11 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
    VP8LFillBitWindow(br);
    if (htree_group->use_packed_table) {
      code = ReadPackedSymbols(htree_group, br, src);
-      if (VP8LIsEndOfStream(br)) break;
      if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
    } else {
      code = ReadSymbol(htree_group->htrees[GREEN], br);
    }
-    if (VP8LIsEndOfStream(br)) break;
+    if (br->eos_) break;  // early out
    if (code < NUM_LITERAL_CODES) {  // Literal
      if (htree_group->is_trivial_literal) {
        *src = htree_group->literal_arb | (code << 8);
@@ -1106,7 +1104,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
        VP8LFillBitWindow(br);
        blue = ReadSymbol(htree_group->htrees[BLUE], br);
        alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
-        if (VP8LIsEndOfStream(br)) break;
+        if (br->eos_) break;
        *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
      }
    AdvanceByOne:
@@ -1134,7 +1132,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
      VP8LFillBitWindow(br);
      dist_code = GetCopyDistance(dist_symbol, br);
      dist = PlaneCodeToDistance(width, dist_code);
-      if (VP8LIsEndOfStream(br)) break;
+      if (br->eos_) break;
      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
        goto Error;
      } else {
@@ -1171,9 +1169,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
    } else {  // Not reached
      goto Error;
    }
+    assert(br->eos_ == VP8LIsEndOfStream(br));
  }

-  br->eos_ = VP8LIsEndOfStream(br);
  if (dec->incremental_ && br->eos_ && src < src_end) {
    RestoreState(dec);
  } else if (!br->eos_) {
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -3,7 +3,6 @@ noinst_LTLIBRARIES += libwebpdsp_sse2.la libwebpdspdecode_sse2.la
 noinst_LTLIBRARIES += libwebpdsp_sse41.la libwebpdspdecode_sse41.la
 noinst_LTLIBRARIES += libwebpdsp_neon.la libwebpdspdecode_neon.la
 noinst_LTLIBRARIES += libwebpdsp_msa.la libwebpdspdecode_msa.la
-noinst_LTLIBRARIES += libwebpdspdecode_wasm.la

 if BUILD_LIBWEBPDECODER
  noinst_LTLIBRARIES += libwebpdspdecode.la
@@ -51,7 +50,6 @@ ENC_SOURCES += enc_mips_dsp_r2.c
 ENC_SOURCES += lossless_enc.c
 ENC_SOURCES += lossless_enc_mips32.c
 ENC_SOURCES += lossless_enc_mips_dsp_r2.c
-ENC_SOURCES += ssim.c

 libwebpdsp_avx2_la_SOURCES =
 libwebpdsp_avx2_la_SOURCES += enc_avx2.c
@@ -83,7 +81,6 @@ libwebpdspdecode_neon_la_SOURCES += lossless_neon.c
 libwebpdspdecode_neon_la_SOURCES += neon.h
 libwebpdspdecode_neon_la_SOURCES += rescaler_neon.c
 libwebpdspdecode_neon_la_SOURCES += upsampling_neon.c
-libwebpdspdecode_neon_la_SOURCES += yuv_neon.c
 libwebpdspdecode_neon_la_CPPFLAGS = $(libwebpdsp_neon_la_CPPFLAGS)
 libwebpdspdecode_neon_la_CFLAGS = $(libwebpdsp_neon_la_CFLAGS)

@@ -97,16 +94,11 @@ libwebpdspdecode_msa_la_SOURCES += upsampling_msa.c
 libwebpdspdecode_msa_la_CPPFLAGS = $(libwebpdsp_msa_la_CPPFLAGS)
 libwebpdspdecode_msa_la_CFLAGS = $(libwebpdsp_msa_la_CFLAGS)

-# WASM is not fully integrated into configure; the addition here keeps source
-# extraction by cmake simple.
-libwebpdspdecode_wasm_la_SOURCES = dec_wasm.c
-
 libwebpdsp_sse2_la_SOURCES =
 libwebpdsp_sse2_la_SOURCES += argb_sse2.c
 libwebpdsp_sse2_la_SOURCES += cost_sse2.c
 libwebpdsp_sse2_la_SOURCES += enc_sse2.c
 libwebpdsp_sse2_la_SOURCES += lossless_enc_sse2.c
-libwebpdsp_sse2_la_SOURCES += ssim_sse2.c
 libwebpdsp_sse2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_sse2_la_CFLAGS = $(AM_CFLAGS) $(SSE2_FLAGS)
 libwebpdsp_sse2_la_LIBADD = libwebpdspdecode_sse2.la
--- a/src/dsp/argb_sse2.c
+++ b/src/dsp/argb_sse2.c
@@ -12,7 +12,6 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include "./dsp.h"
-#include "./lossless.h"

 #if defined(WEBP_USE_SSE2)

@@ -20,13 +19,30 @@
 #include <emmintrin.h>
 #include <string.h>

+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);  // a<<24|r<<16|g<<8|b
+}
+
 static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
-  (void)a;
  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
+    int i = 0;
+    const int len_max = len & ~3;  // max length processed in main loop
+    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
    assert(b == r + 2);
    assert(a == r + 3);
-    VP8LConvertBGRAToRGBA((const uint32_t*)r, len, (uint8_t*)out);
+    for (; i < len_max; i += 4) {
+      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
+      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
+      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
+      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
+      const __m128i F = _mm_or_si128(E, C);
+      _mm_storeu_si128((__m128i*)(out + i), F);
+    }
+    for (; i < len; ++i) {
+      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+    }
  } else {
    assert(g == b + 1);
    assert(r == b + 2);
@@ -39,10 +55,8 @@ static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
 // Entry point

 extern void VP8EncDspARGBInitSSE2(void);
-extern void VP8LDspInitSSE2(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
-  VP8LDspInitSSE2();
  VP8PackARGB = PackARGB;
 }

--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -23,13 +23,11 @@
 #endif

 //------------------------------------------------------------------------------
-// x86/x86-64 micro-arch detection.
+// SSE2 detection.
 //

-// skip x86 specific code for WASM builds
-#if defined(WEBP_USE_WASM)
 // apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
-#elif (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
@@ -65,10 +63,8 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
 #define GetCPUInfo __cpuid
 #endif

-// skip xgetbv definition for WASM builds
-#if defined(WEBP_USE_WASM)
 // NaCl has no support for xgetbv or the raw opcode.
-#elif !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
@@ -98,19 +94,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif

-//------------------------------------------------------------------------------
-// Platform specific VP8CPUInfo functions.
-//
-
-// WASM needs to precede platform specific architecture checks as the defines
-// will still be present when building this target.
-#if defined(WEBP_USE_WASM)
-static int wasmCPUInfo(CPUFeature feature) {
-  if (feature != kWASM) return 0;
-  return 1;
-}
-VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
-#elif defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)

 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -700,7 +700,6 @@ extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
-extern void VP8DspInitWASM(void);

 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
@@ -790,11 +789,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
    if (VP8GetCPUInfo(kMSA)) {
      VP8DspInitMSA();
    }
-#endif
-#if defined(WEBP_USE_WASM)
-    if (VP8GetCPUInfo(kWASM)) {
-      VP8DspInitWASM();
-    }
 #endif
  }
  dec_last_cpuinfo_used = VP8GetCPUInfo;
--- a/src/dsp/dec_msa.c
+++ b/src/dsp/dec_msa.c
@@ -222,7 +222,6 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const v16i8 cnst4b = __msa_ldi_b(4);                        \
  const v16i8 cnst3b = __msa_ldi_b(3);                        \
  const v8i16 cnst9h = __msa_ldi_h(9);                        \
-  const v8i16 cnst63h = __msa_ldi_h(63);                      \
                                                              \
  FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
  filt = __msa_subs_s_b(p1_m, q1_m);                          \
@@ -242,9 +241,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
  ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
  /* update q2/p2 */                                          \
  temp0 = filt_r * cnst9h;                                    \
-  temp1 = temp0 + cnst63h;                                    \
+  temp1 = ADDVI_H(temp0, 63);                                 \
  temp2 = filt_l * cnst9h;                                    \
-  temp3 = temp2 + cnst63h;                                    \
+  temp3 = ADDVI_H(temp2, 63);                                 \
  FILT2(q2_m, p2_m, q2, p2);                                  \
  /* update q1/p1 */                                          \
  temp1 = temp1 + temp0;                                      \
@@ -709,7 +708,7 @@ static void VE4(uint8_t* dst) {    // vertical
  const uint32_t val0 = LW(ptop + 0);
  const uint32_t val1 = LW(ptop + 4);
  uint32_t out;
-  v16u8 A = { 0 }, B, C, AC, B2, R;
+  v16u8 A, B, C, AC, B2, R;

  INSERT_W2_UB(val0, val1, A);
  B = SLDI_UB(A, A, 1);
@@ -726,7 +725,7 @@ static void RD4(uint8_t* dst) {   // Down-right
  uint32_t val0 = LW(ptop + 0);
  uint32_t val1 = LW(ptop + 4);
  uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R, A1 = { 0 };
+  v16u8 A, B, C, AC, B2, R, A1;

  INSERT_W2_UB(val0, val1, A1);
  A = SLDI_UB(A1, A1, 12);
@@ -754,7 +753,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
  uint32_t val0 = LW(ptop + 0);
  uint32_t val1 = LW(ptop + 4);
  uint32_t val2, val3;
-  v16u8 A = { 0 }, B, C, AC, B2, R;
+  v16u8 A, B, C, AC, B2, R;

  INSERT_W2_UB(val0, val1, A);
  B = SLDI_UB(A, A, 1);
--- a/src/dsp/dec_wasm.c
+++ b/src/dsp/dec_wasm.c
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -38,23 +38,10 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

-#if defined(__clang__)
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-# define LOCAL_CLANG_PREREQ(maj, min) \
-    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
-#else
-# define LOCAL_CLANG_VERSION 0
-# define LOCAL_CLANG_PREREQ(maj, min) 0
-#endif
-
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif

-// For now, none of the optimizations below are available in emscripten.
-// WebAssembly overrides native optimizations.
-#if !(defined(EMSCRIPTEN) || defined(WEBP_USE_WASM))
-
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -81,17 +68,15 @@ extern "C" {
 #define WEBP_USE_AVX2
 #endif

-// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
-// inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || \
-     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
-    !defined(__native_client__)
-#define WEBP_USE_NEON
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
+#define WEBP_ANDROID_NEON  // Android targets that might support NEON
 #endif

-#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
-    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
-#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+     defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
+    !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif

@@ -115,8 +100,6 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif

-#endif  /* EMSCRIPTEN */
-
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@@ -156,8 +139,7 @@ typedef enum {
  kNEON,
  kMIPS32,
  kMIPSdspR2,
-  kMSA,
-  kWASM
+  kMSA
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -690,6 +690,140 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 16, 8);
 }

+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// hat-shaped filter. Sum of coefficients is equal to 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
+
+static WEBP_INLINE double SSIMCalculation(  // ratio fnum/fden built from stats
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;   // all constants below are scaled by N^2
+  const uint32_t C1 = 20 * w2;  // added to the xm/ym (first-order) terms
+  const uint32_t C2 = 60 * w2;  // added to the sxy/sxx/syy terms
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // we descale by 8 bits to prevent overflow during the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;
+    assert(r >= 0. && r <= 1.0);  // result is expected to lie in [0, 1]
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);  // N = kWeightSum; stats->w unused
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);  // N = weight accumulated in stats
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };  // sums over the clipped window
+  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+                                                  : yo + VP8_SSIM_KERNEL;
+  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+                                                  : xo + VP8_SSIM_KERNEL;
+  int x, y;
+  src1 += ymin * stride1;  // advance both inputs to row 'ymin'
+  src2 += ymin * stride2;
+  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+    for (x = xmin; x <= xmax; ++x) {
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]  // weight is indexed
+                       * kWeight[VP8_SSIM_KERNEL + y - yo]; // by offset to xo,yo
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;  // total weight of the samples actually visited
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStatsClipped(&stats);  // uses stats.w as sample count
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {  // no clipping
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+  int x, y;
+  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {  // full kernel width, from x=0
+      const uint32_t w = kWeight[x] * kWeight[y];
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;  // stats.w is not accumulated in this variant
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStats(&stats);  // sample count is the fixed kWeightSum
+}
+
+//------------------------------------------------------------------------------
+
+static uint32_t AccumulateSSE(const uint8_t* src1,  // sum of squared diffs
+                              const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;
+  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];  // 8-bit inputs: |diff| <= 255
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+
+//------------------------------------------------------------------------------
+
+VP8SSIMGetFunc VP8SSIMGet;  // function pointers assigned by VP8SSIMDspInit()
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+
+extern void VP8SSIMDspInitSSE2(void);
+
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+    (VP8CPUInfo)&ssim_last_cpuinfo_used;  // starts out as its own address
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;  // same probe: no-op
+
+  VP8SSIMGetClipped = SSIMGetClipped_C;  // install the plain-C versions first
+  VP8SSIMGet = SSIMGet_C;
+
+  VP8AccumulateSSE = AccumulateSSE;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();  // presumably replaces some pointers -- TODO confirm
+    }
+#endif
+  }
+
+  ssim_last_cpuinfo_used = VP8GetCPUInfo;  // remember the probe just used
+}
+
 //------------------------------------------------------------------------------
 // Initialization

--- a/src/dsp/enc_msa.c
+++ b/src/dsp/enc_msa.c
@@ -82,7 +82,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
-  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
+  v16u8 srcl0, srcl1, src0, src1;
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@@ -170,7 +170,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
-  v16i8 src0 = { 0 };
+  v16i8 src0;
  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
  v4i32 dst0, dst1;
  const v16i8 zero = { 0 };
@@ -259,9 +259,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)

 static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
-  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top - 1);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
  const v16u8 AC = __msa_ave_u_b(A, C);
@@ -293,9 +292,8 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
 }

 static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
-  const v16u8 A2 = { 0 };
  const uint64_t val_m = LD(top - 5);
-  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
@@ -313,9 +311,8 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
 }

 static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
-  const v16u8 A1 = { 0 };
  const uint64_t val_m = LD(top);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C1 = SLDI_UB(A, A, 2);
  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@@ -648,7 +645,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
 static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
                                  const uint8_t* top) {
  uint64_t out;
-  v16u8 src = { 0 };
+  v16u8 src;
  if (top != NULL && left != NULL) {
    const uint64_t left_m = LD(left);
    const uint64_t top_m = LD(top);
@@ -780,7 +777,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
+  v16u8 src, ref, tmp0, tmp1;
  v8i16 diff0, diff1;
  v4i32 out0, out1;

@@ -831,7 +828,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  tmp1 = (tmp3 > maxlevel);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
-  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
+  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
  LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -1366,8 +1366,119 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
  VP8Mean16x4 = Mean16x4;
 }

+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point (TODO(skal): move to its own file later)
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;
+  if (len >= 16) {
+    const int limit = len - 32;
+    int32_t tmp[4];
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;
+    while (i <= limit) {
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndAccumulate(a0, b0, &sum1);
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndAccumulate(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndAccumulate(a0, b0, &sum1);
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+  }
+
+  for (; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+
+static uint32_t HorizontalAdd16b(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);
+  const __m128i b = _mm_add_epi16(*m, a);
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);
+  const __m128i b = _mm_add_epi32(*m, a);
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+  return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* process 8 bytes at a time (7 bytes, actually) */       \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate */                                          \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+  ACCUMULATE_ROW(1);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b(&xm);
+  stats.ym  = HorizontalAdd16b(&ym);
+  stats.xxm = HorizontalAdd32b(&xxm);
+  stats.xym = HorizontalAdd32b(&xym);
+  stats.yym = HorizontalAdd32b(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+  VP8AccumulateSSE = AccumulateSSE_SSE2;
+  VP8SSIMGet = SSIMGet_SSE2;
+}
+
 #else  // !WEBP_USE_SSE2

 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)

 #endif  // WEBP_USE_SSE2
--- a/src/dsp/lossless_common.h
+++ b/src/dsp/lossless_common.h
@@ -93,6 +93,14 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 // -----------------------------------------------------------------------------
 // PrefixEncode()

+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1))) {  // zero or a power of two.
+    return log_floor;
+  }
+  return log_floor + 1;
+}
+
 // Splitting of distance and length codes into prefixes and
 // extra bits. The prefixes are encoded with an entropy code
 // while the extra bits are stored just as normal bits.
--- a/src/dsp/lossless_enc.c
+++ b/src/dsp/lossless_enc.c
@@ -520,8 +520,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
    const uint32_t argb = data[i];
    const uint32_t green = argb >> 8;
    const uint32_t red = argb >> 16;
-    int new_red = red & 0xff;
-    int new_blue = argb & 0xff;
+    int new_red = red;
+    int new_blue = argb;
    new_red -= ColorTransformDelta(m->green_to_red_, green);
    new_red &= 0xff;
    new_blue -= ColorTransformDelta(m->green_to_blue_, green);
--- a/src/dsp/lossless_msa.c
+++ b/src/dsp/lossless_msa.c
@@ -43,7 +43,7 @@

 #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {         \
  uint64_t pix_d;                                          \
-  v16u8 src0, src1, src2 = { 0 }, dst0, dst1;              \
+  v16u8 src0, src1, src2, dst0, dst1;                      \
  LD_UB2(psrc, 16, src0, src1);                            \
  VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
  ST_UB(dst0, pdst);                                       \
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@@ -272,24 +272,9 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2

 // Predictor10: average of (average of (L,TL), average of (T, TR)).
-#define DO_PRED10(OUT) do {               \
-  __m128i avgLTL, avg;                    \
-  Average2_m128i(&L, &TL, &avgLTL);       \
-  Average2_m128i(&avgTTR, &avgLTL, &avg); \
-  L = _mm_add_epi8(avg, src);             \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
-} while (0)
-
-#define DO_PRED10_SHIFT do {                                  \
-  /* Rotate the pre-computed values for the next iteration.*/ \
-  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
-  TL = _mm_srli_si128(TL, 4);                                 \
-  src = _mm_srli_si128(src, 4);                               \
-} while (0)
-
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
-  int i;
+  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -298,88 +283,77 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
-    DO_PRED10(0);
-    DO_PRED10_SHIFT;
-    DO_PRED10(1);
-    DO_PRED10_SHIFT;
-    DO_PRED10(2);
-    DO_PRED10_SHIFT;
-    DO_PRED10(3);
+    for (j = 0; j < 4; ++j) {
+      __m128i avgLTL, avg;
+      Average2_m128i(&L, &TL, &avgLTL);
+      Average2_m128i(&avgTTR, &avgLTL, &avg);
+      L = _mm_add_epi8(avg, src);
+      out[i + j] = _mm_cvtsi128_si32(L);
+      // Rotate the pre-computed values for the next iteration.
+      avgTTR = _mm_srli_si128(avgTTR, 4);
+      TL = _mm_srli_si128(TL, 4);
+      src = _mm_srli_si128(src, 4);
+    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
 }
-#undef DO_PRED10
-#undef DO_PRED10_SHIFT

 // Predictor11: select.
-#define DO_PRED11(OUT) do {                                            \
-  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
-  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
-  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
-  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
-  const __m128i A = _mm_and_si128(mask, L);                            \
-  const __m128i B = _mm_andnot_si128(mask, T);                         \
-  const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
-  L = _mm_add_epi8(src, pred);                                         \
-  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
-} while (0)
-
-#define DO_PRED11_SHIFT do {                                \
-  /* Shift the pre-computed value for the next iteration.*/ \
-  T = _mm_srli_si128(T, 4);                                 \
-  TL = _mm_srli_si128(TL, 4);                               \
-  src = _mm_srli_si128(src, 4);                             \
-  pa = _mm_srli_si128(pa, 4);                               \
-} while (0)
+static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
+                            __m128i* const out) {
+  // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (so that their sum of abs diff is zero). Here we use *A.
+  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+  *out = _mm_packs_epi32(s_lo, s_hi);
+}

 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
-  int i;
-  __m128i pa;
+  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
-    {
-      // We can unpack with any value on the upper 32 bits, provided it's the
-      // same on both operands (so that their sum of abs diff is zero). Here we
-      // use T.
-      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
-      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
-      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
-      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
-      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
-      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
-      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
+    __m128i pa;
+    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
+    for (j = 0; j < 4; ++j) {
+      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
+      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
+      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
+      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+      const __m128i A = _mm_and_si128(mask, L);
+      const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);    // pred = (pb > pa)? L : T
+      L = _mm_add_epi8(src, pred);
+      out[i + j] = _mm_cvtsi128_si32(L);
+      // Shift the pre-computed value for the next iteration.
+      T = _mm_srli_si128(T, 4);
+      TL = _mm_srli_si128(TL, 4);
+      src = _mm_srli_si128(src, 4);
+      pa = _mm_srli_si128(pa, 4);
    }
-    DO_PRED11(0);
-    DO_PRED11_SHIFT;
-    DO_PRED11(1);
-    DO_PRED11_SHIFT;
-    DO_PRED11(2);
-    DO_PRED11_SHIFT;
-    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
 }
-#undef DO_PRED11
-#undef DO_PRED11_SHIFT

 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT) do {            \
-  const __m128i all = _mm_add_epi16(L, (DIFF));    \
-  const __m128i alls = _mm_packus_epi16(all, all); \
-  const __m128i res = _mm_add_epi8(src, alls);     \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
-  L = _mm_unpacklo_epi8(res, zero);                \
-} while (0)
-
-#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
+#define DO_PRED12(DIFF, LANE, OUT)                          \
+do {                                                        \
+  const __m128i all = _mm_add_epi16(L, (DIFF));             \
+  const __m128i alls = _mm_packus_epi16(all, all);          \
+  const __m128i res = _mm_add_epi8(src, alls);              \
+  out[i + (OUT)] = _mm_cvtsi128_si32(res);                  \
+  L = _mm_unpacklo_epi8(res, zero);                         \
  /* Shift the pre-computed value for the next iteration.*/ \
  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
  src = _mm_srli_si128(src, 4);                             \
@@ -403,11 +377,8 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
-    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
-    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
-    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
@@ -415,7 +386,6 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
  }
 }
 #undef DO_PRED12
-#undef DO_PRED12_SHIFT

 // Due to averages with integers, values cannot be accumulated in parallel for
 // predictors 13.
@@ -522,24 +492,25 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,

 static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
-  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
-    const __m128i A1 = _mm_loadu_si128(in++);
-    const __m128i A2 = _mm_loadu_si128(in++);
-    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
-    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
-    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
-    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
-    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
-    const __m128i F1 = _mm_or_si128(E1, C1);
-    const __m128i F2 = _mm_or_si128(E2, C2);
-    _mm_storeu_si128(out++, F1);
-    _mm_storeu_si128(out++, F2);
+    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
+    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
+    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
+    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
+    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
+    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
+    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
+    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
+    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
+    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
+    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
+    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
+    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
+    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
+    _mm_storeu_si128(out++, rgba0);
+    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
--- a/src/dsp/msa_macro.h
+++ b/src/dsp/msa_macro.h
@@ -22,7 +22,6 @@
 #endif

 #ifdef CLANG_BUILD
-  #define ALPHAVAL  (-1)
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
@@ -33,7 +32,6 @@
  #define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
  #define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
 #else
-  #define ALPHAVAL  (0xff)
  #define ADDVI_H(a, b)  (a + b)
  #define ADDVI_W(a, b)  (a + b)
  #define SRAI_B(a, b)  (a >> b)
--- a/src/dsp/neon.h
+++ b/src/dsp/neon.h
@@ -17,9 +17,8 @@
 #include "./dsp.h"

 // Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except newer clang/gcc or aarch64 where the inline assembly is
-// incompatible.
-#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
+// everywhere except aarch64 where the inline assembly is incompatible.
+#if defined(__aarch64__)
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif

@@ -44,7 +43,7 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif

--- a/src/dsp/rescaler_mips32.c
+++ b/src/dsp/rescaler_mips32.c
@@ -279,7 +279,8 @@ extern void WebPRescalerDspInitMIPS32(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
  WebPRescalerImportRowExpand = ImportRowExpand;
-  WebPRescalerImportRowShrink = ImportRowShrink;
+  // WebPRescalerImportRowShrink = ImportRowShrink;
+  (void)ImportRowShrink;
  WebPRescalerExportRowExpand = ExportRowExpand;
  WebPRescalerExportRowShrink = ExportRowShrink;
 }
--- a/src/dsp/ssim.c
+++ b/src/dsp/ssim.c
@@ -1,151 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// distortion calculation
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include <stdlib.h>  // for abs()
-
-#include "./dsp.h"
-
-//------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
-  1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
-    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const uint32_t w2 =  N * N;
-  const uint32_t C1 = 20 * w2;
-  const uint32_t C2 = 60 * w2;
-  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
-  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
-  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
-  if (xmxm + ymym >= C3) {
-    const int64_t xmym = (int64_t)stats->xm * stats->ym;
-    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
-    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
-    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
-    // we descale by 8 to prevent overflow during the fnum/fden multiply.
-    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
-    const uint64_t den_S = (sxx + syy + C2) >> 8;
-    const uint64_t fnum = (2 * xmym + C1) * num_S;
-    const uint64_t fden = (xmxm + ymym + C1) * den_S;
-    const double r = (double)fnum / fden;
-    assert(r >= 0. && r <= 1.0);
-    return r;
-  }
-  return 1.;   // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
-                               const uint8_t* src2, int stride2,
-                               int xo, int yo, int W, int H) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
-  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
-                                                  : yo + VP8_SSIM_KERNEL;
-  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
-  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
-                                                  : xo + VP8_SSIM_KERNEL;
-  int x, y;
-  src1 += ymin * stride1;
-  src2 += ymin * stride2;
-  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
-    for (x = xmin; x <= xmax; ++x) {
-      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
-                       * kWeight[VP8_SSIM_KERNEL + y - yo];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.w   += w;
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
-                        const uint8_t* src2, int stride2) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  int x, y;
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const uint32_t w = kWeight[x] * kWeight[y];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
-                              const uint8_t* src2, int len) {
-  int i;
-  uint32_t sse2 = 0;
-  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
-  for (i = 0; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8SSIMGetClipped = SSIMGetClipped_C;
-  VP8SSIMGet = SSIMGet_C;
-
-  VP8AccumulateSSE = AccumulateSSE;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SSIMDspInitSSE2();
-    }
-#endif
-  }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
--- a/src/dsp/ssim_sse2.c
+++ b/src/dsp/ssim_sse2.c
@@ -1,154 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// SSE2 version of distortion calculation
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_SSE2)
-
-#include <assert.h>
-#include <emmintrin.h>
-
-#include "./common_sse2.h"
-
-// Helper function
-static WEBP_INLINE void SubtractAndSquare(const __m128i a, const __m128i b,
-                                          __m128i* const sum) {
-  // take abs(a-b) in 8b
-  const __m128i a_b = _mm_subs_epu8(a, b);
-  const __m128i b_a = _mm_subs_epu8(b, a);
-  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
-  // zero-extend to 16b
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
-  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
-  // multiply with self
-  const __m128i sum1 = _mm_madd_epi16(C0, C0);
-  const __m128i sum2 = _mm_madd_epi16(C1, C1);
-  *sum = _mm_add_epi32(sum1, sum2);
-}
-
-//------------------------------------------------------------------------------
-// SSIM / PSNR entry point
-
-static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
-                                   const uint8_t* src2, int len) {
-  int i = 0;
-  uint32_t sse2 = 0;
-  if (len >= 16) {
-    const int limit = len - 32;
-    int32_t tmp[4];
-    __m128i sum1;
-    __m128i sum = _mm_setzero_si128();
-    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-    i += 16;
-    while (i <= limit) {
-      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      __m128i sum2;
-      i += 16;
-      SubtractAndSquare(a0, b0, &sum1);
-      sum = _mm_add_epi32(sum, sum1);
-      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      i += 16;
-      SubtractAndSquare(a1, b1, &sum2);
-      sum = _mm_add_epi32(sum, sum2);
-    }
-    SubtractAndSquare(a0, b0, &sum1);
-    sum = _mm_add_epi32(sum, sum1);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-
-  for (; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-static uint32_t HorizontalAdd16b(const __m128i* const m) {
-  uint16_t tmp[8];
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi16(*m, a);
-  _mm_storeu_si128((__m128i*)tmp, b);
-  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
-}
-
-static uint32_t HorizontalAdd32b(const __m128i* const m) {
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi32(*m, a);
-  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
-  return (uint32_t)_mm_cvtsi128_si32(c);
-}
-
-static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
-
-#define ACCUMULATE_ROW(WEIGHT) do {                         \
-  /* compute row weight (Wx * Wy) */                        \
-  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
-  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
-  /* process 8 bytes at a time (7 bytes, actually) */       \
-  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
-  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
-  /* convert to 16b and multiply by weight */               \
-  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
-  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
-  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
-  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
-  /* accumulate */                                          \
-  xm  = _mm_add_epi16(xm, wa1);                             \
-  ym  = _mm_add_epi16(ym, wb1);                             \
-  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
-  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
-  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
-  src1 += stride1;                                          \
-  src2 += stride2;                                          \
-} while (0)
-
-static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2) {
-  VP8DistoStats stats;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;                // 16b accums
-  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
-  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
-  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  ACCUMULATE_ROW(1);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(4);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(1);
-  stats.xm  = HorizontalAdd16b(&xm);
-  stats.ym  = HorizontalAdd16b(&ym);
-  stats.xxm = HorizontalAdd32b(&xxm);
-  stats.xym = HorizontalAdd32b(&xym);
-  stats.yym = HorizontalAdd32b(&yym);
-  return VP8SSIMFromStats(&stats);
-}
-
-extern void VP8SSIMDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
-  VP8AccumulateSSE = AccumulateSSE_SSE2;
-  VP8SSIMGet = SSIMGet_SSE2;
-}
-
-#else  // !WEBP_USE_SSE2
-
-WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
-
-#endif  // WEBP_USE_SSE2
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@@ -93,13 +93,13 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }

 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair_C,  VP8YuvToRgb,  3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair_C,  VP8YuvToBgr,  3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair_C,  VP8YuvToRgb565,  2)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)

 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -161,13 +161,13 @@ void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }

-YUV444_FUNC(WebPYuv444ToRgb_C,      VP8YuvToRgb,  3)
-YUV444_FUNC(WebPYuv444ToBgr_C,      VP8YuvToBgr,  3)
-YUV444_FUNC(WebPYuv444ToRgba_C,     VP8YuvToRgba, 4)
-YUV444_FUNC(WebPYuv444ToBgra_C,     VP8YuvToBgra, 4)
-YUV444_FUNC(WebPYuv444ToArgb_C,     VP8YuvToArgb, 4)
-YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
-YUV444_FUNC(WebPYuv444ToRgb565_C,   VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgbC,      VP8YuvToRgb,  3)
+YUV444_FUNC(WebPYuv444ToBgrC,      VP8YuvToBgr,  3)
+YUV444_FUNC(WebPYuv444ToRgbaC,     VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgraC,     VP8YuvToBgra, 4)
+YUV444_FUNC(WebPYuv444ToArgbC,     VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565C,   VP8YuvToRgb565, 2)

 #undef YUV444_FUNC

@@ -182,17 +182,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
  if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;

-  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgb_C;
-  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
-  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgr_C;
-  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgra_C;
-  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgb_C;
-  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
-  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565_C;
-  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgba_C;
-  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgra_C;
-  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgb_C;
-  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
+  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgbC;
+  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgbaC;
+  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgrC;
+  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgraC;
+  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgbC;
+  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
+  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565C;
+  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgbaC;
+  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgraC;
+  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgbC;
+  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;

  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -224,17 +224,17 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;

 #ifdef FANCY_UPSAMPLING
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair_C;
-  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair_C;
-  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair_C;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair_C;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
-  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair_C;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair_C;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair_C;
-  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair_C;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
--- a/src/dsp/upsampling_msa.c
+++ b/src/dsp/upsampling_msa.c
@@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(R, G, B, A, dst);
@@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
 static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(B, G, R, A, dst);
@@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
+  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
  while (length >= 16) {
    CALC_RGB16(y, u, v, R, G, B);
    STORE16_4(A, R, G, B, dst);
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@@ -121,10 +121,10 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],

 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                       top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32_SSE2(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);         \
+  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
  if (bottom_y != NULL) {                                                      \
-    FUNC##32_SSE2(bottom_y + (cur_x), r_u + 64, r_v + 64,                      \
-                  bottom_dst + (cur_x) * XSTEP);                               \
+    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
+             bottom_dst + (cur_x) * XSTEP);                                    \
  }                                                                            \
 } while (0)

@@ -213,40 +213,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);

-#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
-extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
-                   uint8_t* dst, int len);                                     \
+#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
+extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u,             \
+                               const uint8_t* v, uint8_t* dst, int len);       \
 static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
                      uint8_t* dst, int len) {                                 \
  int i;                                                                       \
  const int max_len = len & ~31;                                               \
  for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
  if (i < len) {  /* C-fallback */                                             \
-    CALL_C(y + i, u + i, v + i, dst + i * XSTEP, len - i);                     \
+    WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i);         \
  }                                                                            \
 }

-YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
-YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
-YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
-YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
-YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
-YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
-            WebPYuv444ToRgba4444_C, 2)
-YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
+YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
+YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
+YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
+YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
-  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba_SSE2;
-  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra_SSE2;
-  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE2;
-  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE2;
-  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb_SSE2;
-  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
-  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565_SSE2;
-  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba_SSE2;
-  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra_SSE2;
-  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb_SSE2;
-  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
+  WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_RGB]  = Yuv444ToRgb;
+  WebPYUV444Converters[MODE_BGR]  = Yuv444ToBgr;
 }

 #else
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -308,9 +308,7 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
    (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;

 extern void WebPInitConvertARGBToYUVSSE2(void);
-extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
-extern void WebPInitSharpYUVNEON(void);

 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
  if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
@@ -334,13 +332,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
      WebPInitSharpYUVSSE2();
    }
 #endif  // WEBP_USE_SSE2
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitConvertARGBToYUVNEON();
-      WebPInitSharpYUVNEON();
-    }
-#endif  // WEBP_USE_NEON
-
  }
  rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -166,20 +166,20 @@ void VP8YUVInit(void);
 #if defined(WEBP_USE_SSE2)

 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst);
-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst);
-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst);
-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst);
-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst);
+void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                      uint8_t* dst);

 #endif    // WEBP_USE_SSE2

--- a/src/dsp/yuv_neon.c
+++ b/src/dsp/yuv_neon.c
@@ -1,289 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// YUV->RGB conversion functions
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./yuv.h"
-
-#if defined(WEBP_USE_NEON)
-
-#include <assert.h>
-#include <stdlib.h>
-
-#include "./neon.h"
-
-//-----------------------------------------------------------------------------
-
-static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
-                                    const uint8x8_t G,
-                                    const uint8x8_t B) {
-  const uint16x8_t r = vmovl_u8(R);
-  const uint16x8_t g = vmovl_u8(G);
-  const uint16x8_t b = vmovl_u8(B);
-  const uint16x4_t r_lo = vget_low_u16(r);
-  const uint16x4_t r_hi = vget_high_u16(r);
-  const uint16x4_t g_lo = vget_low_u16(g);
-  const uint16x4_t g_hi = vget_high_u16(g);
-  const uint16x4_t b_lo = vget_low_u16(b);
-  const uint16x4_t b_hi = vget_high_u16(b);
-  const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);
-  const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
-  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
-  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
-  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
-  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
-  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
-                                     vrshrn_n_u32(tmp2_hi, 16));
-  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
-  return vqmovn_u16(Y2);
-}
-
-static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
-  int i;
-  for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
-    const uint8x8x3_t RGB = vld3_u8(rgb);
-    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
-    vst1_u8(y + i, Y);
-  }
-  for (; i < width; ++i, rgb += 3) {   // left-over
-    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
-  }
-}
-
-static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
-  int i;
-  for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
-    const uint8x8x3_t BGR = vld3_u8(bgr);
-    const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
-    vst1_u8(y + i, Y);
-  }
-  for (; i < width; ++i, bgr += 3) {  // left-over
-    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
-  }
-}
-
-static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
-  int i;
-  for (i = 0; i + 8 <= width; i += 8) {
-    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
-    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
-    vst1_u8(y + i, Y);
-  }
-  for (; i < width; ++i) {   // left-over
-    const uint32_t p = argb[i];
-    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
-                     YUV_HALF);
-  }
-}
-
-//-----------------------------------------------------------------------------
-
-// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
-#define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
-  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
-  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
-  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
-  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
-  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
-  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
-
-#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
-  const int32x4_t tmp0_lo = vmull_n_s16(         r_lo, C0);      \
-  const int32x4_t tmp0_hi = vmull_n_s16(         r_hi, C0);      \
-  const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
-  const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
-  const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
-  const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
-  const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
-                                      vshrn_n_s32(tmp2_hi, 16)); \
-  DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
-} while (0)
-
-// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
-#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
-  MULTIPLY_16b_PREAMBLE(r, g, b);                                \
-  MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
-  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
-} while (0)
-
-static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
-                                   uint8_t* u, uint8_t* v, int width) {
-  int i;
-  for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
-    const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
-    int16x8_t U, V;
-    CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
-    vst1_u8(u + i, vqrshrun_n_s16(U, 2));
-    vst1_u8(v + i, vqrshrun_n_s16(V, 2));
-  }
-  for (; i < width; i += 1, rgb += 4) {
-    const int r = rgb[0], g = rgb[1], b = rgb[2];
-    u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
-    v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
-  }
-}
-
-static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
-                                 int src_width, int do_store) {
-  int i;
-  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
-    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
-    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
-    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
-    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
-    int16x8_t U_tmp, V_tmp;
-    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
-    {
-      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
-      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
-      if (do_store) {
-        vst1_u8(u, U);
-        vst1_u8(v, V);
-      } else {
-        const uint8x8_t prev_u = vld1_u8(u);
-        const uint8x8_t prev_v = vld1_u8(v);
-        vst1_u8(u, vrhadd_u8(U, prev_u));
-        vst1_u8(v, vrhadd_u8(V, prev_v));
-      }
-    }
-  }
-  if (i < src_width) {  // left-over
-    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
-  }
-}
-
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitConvertARGBToYUVNEON(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
-  WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
-  WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
-  WebPConvertARGBToY = ConvertARGBToY_NEON;
-  WebPConvertARGBToUV = ConvertARGBToUV_NEON;
-  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
-}
-
-//------------------------------------------------------------------------------
-
-#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
-static uint16_t clip_y(int v) {
-  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
-}
-
-static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
-                                     uint16_t* dst, int len) {
-  int i;
-  const int16x8_t zero = vdupq_n_s16(0);
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  uint64x2_t sum = vdupq_n_u64(0);
-  uint64_t diff;
-
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
-    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
-    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
-    const int16x8_t D = vsubq_s16(A, B);       // diff_y
-    const int16x8_t F = vaddq_s16(C, D);       // new_y
-    const uint16x8_t H =
-        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
-    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
-    vst1q_u16(dst + i, H);
-    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
-  }
-  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
-  for (; i < len; ++i) {
-    const int diff_y = ref[i] - src[i];
-    const int new_y = (int)(dst[i]) + diff_y;
-    dst[i] = clip_y(new_y);
-    diff += (uint64_t)(abs(diff_y));
-  }
-  return diff;
-}
-
-static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
-                                   int16_t* dst, int len) {
-  int i;
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t A = vld1q_s16(ref + i);
-    const int16x8_t B = vld1q_s16(src + i);
-    const int16x8_t C = vld1q_s16(dst + i);
-    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
-    const int16x8_t E = vaddq_s16(C, D);   // new_uv
-    vst1q_s16(dst + i, E);
-  }
-  for (; i < len; ++i) {
-    const int diff_uv = ref[i] - src[i];
-    dst[i] += diff_uv;
-  }
-}
-
-static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
-                                   const uint16_t* best_y, uint16_t* out) {
-  int i;
-  const int16x8_t max = vdupq_n_s16(MAX_Y);
-  const int16x8_t zero = vdupq_n_s16(0);
-  for (i = 0; i + 8 <= len; i += 8) {
-    const int16x8_t a0 = vld1q_s16(A + i + 0);
-    const int16x8_t a1 = vld1q_s16(A + i + 1);
-    const int16x8_t b0 = vld1q_s16(B + i + 0);
-    const int16x8_t b1 = vld1q_s16(B + i + 1);
-    const int16x8_t a0b1 = vaddq_s16(a0, b1);
-    const int16x8_t a1b0 = vaddq_s16(a1, b0);
-    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
-    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
-    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
-    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
-    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
-    const int16x8_t d0 = vaddq_s16(c1, a0);
-    const int16x8_t d1 = vaddq_s16(c0, a1);
-    const int16x8_t e0 = vrshrq_n_s16(d0, 1);
-    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
-    const int16x8x2_t f = vzipq_s16(e0, e1);
-    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
-    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
-    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
-    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
-    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
-    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
-    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
-    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
-  }
-  for (; i < len; ++i) {
-    const int a0b1 = A[i + 0] + B[i + 1];
-    const int a1b0 = A[i + 1] + B[i + 0];
-    const int a0a1b0b1 = a0b1 + a1b0 + 8;
-    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
-    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
-    out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
-    out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
-  }
-}
-#undef MAX_Y
-
-//------------------------------------------------------------------------------
-
-extern void WebPInitSharpYUVNEON(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
-  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
-  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
-  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
-}
-
-#else  // !WEBP_USE_NEON
-
-WEBP_DSP_INIT_STUB(WebPInitSamplersNEON)
-WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
-WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
-
-#endif  // WEBP_USE_NEON
--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@@ -186,8 +186,8 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }

-void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -197,8 +197,8 @@ void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -208,8 +208,8 @@ void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst) {
+void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 32) {
@@ -219,8 +219,8 @@ void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
-                             const uint8_t* v, uint8_t* dst) {
+void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
  const __m128i kAlpha = _mm_set1_epi16(255);
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
@@ -230,8 +230,8 @@ void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
  }
 }

-void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                           uint8_t* dst) {
+void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                      uint8_t* dst) {
  int n;
  for (n = 0; n < 32; n += 8, dst += 16) {
    __m128i R, G, B;
@@ -240,8 +240,8 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  }
 }

-void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;

@@ -262,8 +262,8 @@ void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
  PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }

-void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;

--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@@ -3,7 +3,6 @@ noinst_LTLIBRARIES = libwebpencode.la
 libwebpencode_la_SOURCES =
 libwebpencode_la_SOURCES += alpha_enc.c
 libwebpencode_la_SOURCES += analysis_enc.c
-libwebpencode_la_SOURCES += backward_references_cost_enc.c
 libwebpencode_la_SOURCES += backward_references_enc.c
 libwebpencode_la_SOURCES += backward_references_enc.h
 libwebpencode_la_SOURCES += config_enc.c
--- a/src/enc/backward_references_cost_enc.c
+++ b/src/enc/backward_references_cost_enc.c
@@ -1,790 +0,0 @@
-// Copyright 2017 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Improves a given set of backward references by analyzing its bit cost.
-// The algorithm is similar to the Zopfli compression algorithm but tailored to
-// images.
-//
-// Author: Vincent Rabaud (vrabaud@google.com)
-//
-
-#include <assert.h>
-
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../dsp/lossless_common.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/utils.h"
-
-#define VALUES_IN_BYTE 256
-
-extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
-extern int VP8LDistanceToPlaneCode(int xsize, int dist);
-extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
-                                      const PixOrCopy v);
-
-typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
-} CostModel;
-
-static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
-  uint32_t sum = 0;
-  int nonzeros = 0;
-  int i;
-  for (i = 0; i < num_symbols; ++i) {
-    sum += population_counts[i];
-    if (population_counts[i] > 0) {
-      ++nonzeros;
-    }
-  }
-  if (nonzeros <= 1) {
-    memset(output, 0, num_symbols * sizeof(*output));
-  } else {
-    const double logsum = VP8LFastLog2(sum);
-    for (i = 0; i < num_symbols; ++i) {
-      output[i] = logsum - VP8LFastLog2(population_counts[i]);
-    }
-  }
-}
-
-static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
-                          const VP8LBackwardRefs* const refs) {
-  int ok = 0;
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
-  if (histo == NULL) goto Error;
-
-  // The following code is similar to VP8LHistogramCreate but converts the
-  // distance to plane code.
-  VP8LHistogramInit(histo, cache_bits);
-  while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
-                                    xsize);
-    VP8LRefsCursorNext(&c);
-  }
-
-  ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->red_, m->red_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->blue_, m->blue_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
-  ConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
-  ok = 1;
-
- Error:
-  VP8LFreeHistogram(histo);
-  return ok;
-}
-
-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
-  return m->alpha_[v >> 24] +
-         m->red_[(v >> 16) & 0xff] +
-         m->literal_[(v >> 8) & 0xff] +
-         m->blue_[v & 0xff];
-}
-
-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
-  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return m->literal_[literal_idx];
-}
-
-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
-                                        uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
-}
-
-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
-                                          uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
-}
-
-static WEBP_INLINE void AddSingleLiteralWithCostModel(
-    const uint32_t* const argb, VP8LColorCache* const hashers,
-    const CostModel* const cost_model, int idx, int use_color_cache,
-    float prev_cost, float* const cost, uint16_t* const dist_array) {
-  double cost_val = prev_cost;
-  const uint32_t color = argb[idx];
-  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
-  if (ix >= 0) {
-    // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
-    cost_val += GetCacheCost(cost_model, ix) * mul0;
-  } else {
-    const double mul1 = 0.82;
-    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
-    cost_val += GetLiteralCost(cost_model, color) * mul1;
-  }
-  if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
-    dist_array[idx] = 1;  // only one is inserted.
-  }
-}
-
-// -----------------------------------------------------------------------------
-// CostManager and interval handling
-
-// Empirical value to avoid high memory consumption but good for performance.
-#define COST_CACHE_INTERVAL_SIZE_MAX 500
-
-// To perform backward reference every pixel at index index_ is considered and
-// the cost for the MAX_LENGTH following pixels computed. Those following pixels
-// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
-//     cost_ = distance cost at index + GetLengthCost(cost_model, k)
-// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
-// array of size MAX_LENGTH.
-// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
-// minimal values using intervals of constant cost.
-// An interval is defined by the index_ of the pixel that generated it and
-// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
-// it contains the minimum value for pixels between start_ and end_.
-// Intervals are stored in a linked list and ordered by start_. When a new
-// interval has a better value, old intervals are split or removed. There are
-// therefore no overlapping intervals.
-typedef struct CostInterval CostInterval;
-struct CostInterval {
-  float cost_;
-  int start_;
-  int end_;
-  int index_;
-  CostInterval* previous_;
-  CostInterval* next_;
-};
-
-// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
-typedef struct {
-  double cost_;
-  int start_;
-  int end_;       // Exclusive.
-} CostCacheInterval;
-
-// This structure is in charge of managing intervals and costs.
-// It caches the different CostCacheInterval, caches the different
-// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
-// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
-#define COST_MANAGER_MAX_FREE_LIST 10
-typedef struct {
-  CostInterval* head_;
-  int count_;  // The number of stored intervals.
-  CostCacheInterval* cache_intervals_;
-  size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
-  float* costs_;
-  uint16_t* dist_array_;
-  // Most of the time, we only need few intervals -> use a free-list, to avoid
-  // fragmentation with small allocs in most common cases.
-  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
-  CostInterval* free_intervals_;
-  // These are regularly malloc'd remains. This list can't grow larger than than
-  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
-  CostInterval* recycled_intervals_;
-} CostManager;
-
-static void CostIntervalAddToFreeList(CostManager* const manager,
-                                      CostInterval* const interval) {
-  interval->next_ = manager->free_intervals_;
-  manager->free_intervals_ = interval;
-}
-
-static int CostIntervalIsInFreeList(const CostManager* const manager,
-                                    const CostInterval* const interval) {
-  return (interval >= &manager->intervals_[0] &&
-          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
-}
-
-static void CostManagerInitFreeList(CostManager* const manager) {
-  int i;
-  manager->free_intervals_ = NULL;
-  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
-    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
-  }
-}
-
-static void DeleteIntervalList(CostManager* const manager,
-                               const CostInterval* interval) {
-  while (interval != NULL) {
-    const CostInterval* const next = interval->next_;
-    if (!CostIntervalIsInFreeList(manager, interval)) {
-      WebPSafeFree((void*)interval);
-    }  // else: do nothing
-    interval = next;
-  }
-}
-
-static void CostManagerClear(CostManager* const manager) {
-  if (manager == NULL) return;
-
-  WebPSafeFree(manager->costs_);
-  WebPSafeFree(manager->cache_intervals_);
-
-  // Clear the interval lists.
-  DeleteIntervalList(manager, manager->head_);
-  manager->head_ = NULL;
-  DeleteIntervalList(manager, manager->recycled_intervals_);
-  manager->recycled_intervals_ = NULL;
-
-  // Reset pointers, count_ and cache_intervals_size_.
-  memset(manager, 0, sizeof(*manager));
-  CostManagerInitFreeList(manager);
-}
-
-static int CostManagerInit(CostManager* const manager,
-                           uint16_t* const dist_array, int pix_count,
-                           const CostModel* const cost_model) {
-  int i;
-  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
-
-  manager->costs_ = NULL;
-  manager->cache_intervals_ = NULL;
-  manager->head_ = NULL;
-  manager->recycled_intervals_ = NULL;
-  manager->count_ = 0;
-  manager->dist_array_ = dist_array;
-  CostManagerInitFreeList(manager);
-
-  // Fill in the cost_cache_.
-  manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
-  for (i = 1; i < cost_cache_size; ++i) {
-    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
-    // Get the number of bound intervals.
-    if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
-      ++manager->cache_intervals_size_;
-    }
-  }
-
-  // With the current cost model, we usually have below 20 intervals.
-  // The worst case scenario with a cost model would be if every length has a
-  // different cost, hence MAX_LENGTH but that is impossible with the current
-  // implementation that spirals around a pixel.
-  assert(manager->cache_intervals_size_ <= MAX_LENGTH);
-  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
-      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
-  if (manager->cache_intervals_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-
-  // Fill in the cache_intervals_.
-  {
-    CostCacheInterval* cur = manager->cache_intervals_;
-
-    // Consecutive values in cost_cache_ are compared and if a big enough
-    // difference is found, a new interval is created and bounded.
-    cur->start_ = 0;
-    cur->end_ = 1;
-    cur->cost_ = manager->cost_cache_[0];
-    for (i = 1; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
-      if (cost_val != cur->cost_) {
-        ++cur;
-        // Initialize an interval.
-        cur->start_ = i;
-        cur->cost_ = cost_val;
-      }
-      cur->end_ = i + 1;
-    }
-  }
-
-  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
-  if (manager->costs_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
-
-  return 1;
-}
-
-// Given the cost and the position that define an interval, update the cost at
-// pixel 'i' if it is smaller than the previously computed value.
-static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
-                                   int position, float cost) {
-  const int k = i - position;
-  assert(k >= 0 && k < MAX_LENGTH);
-
-  if (manager->costs_[i] > cost) {
-    manager->costs_[i] = cost;
-    manager->dist_array_[i] = k + 1;
-  }
-}
-
-// Given the cost and the position that define an interval, update the cost for
-// all the pixels between 'start' and 'end' excluded.
-static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
-                                              int start, int end, int position,
-                                              float cost) {
-  int i;
-  for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
-}
-
-// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
-static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
-                                         CostInterval* const prev,
-                                         CostInterval* const next) {
-  if (prev != NULL) {
-    prev->next_ = next;
-  } else {
-    manager->head_ = next;
-  }
-
-  if (next != NULL) next->previous_ = prev;
-}
-
-// Pop an interval in the manager.
-static WEBP_INLINE void PopInterval(CostManager* const manager,
-                                    CostInterval* const interval) {
-  if (interval == NULL) return;
-
-  ConnectIntervals(manager, interval->previous_, interval->next_);
-  if (CostIntervalIsInFreeList(manager, interval)) {
-    CostIntervalAddToFreeList(manager, interval);
-  } else {  // recycle regularly malloc'd intervals too
-    interval->next_ = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval;
-  }
-  --manager->count_;
-  assert(manager->count_ >= 0);
-}
-
-// Update the cost at index i by going over all the stored intervals that
-// overlap with i.
-// If 'do_clean_intervals' is set to something different than 0, intervals that
-// end before 'i' will be popped.
-static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
-                                          int do_clean_intervals) {
-  CostInterval* current = manager->head_;
-
-  while (current != NULL && current->start_ <= i) {
-    CostInterval* const next = current->next_;
-    if (current->end_ <= i) {
-      if (do_clean_intervals) {
-        // We have an outdated interval, remove it.
-        PopInterval(manager, current);
-      }
-    } else {
-      UpdateCost(manager, i, current->index_, current->cost_);
-    }
-    current = next;
-  }
-}
-
-// Given a current orphan interval and its previous interval, before
-// it was orphaned (which can be NULL), set it at the right place in the list
-// of intervals using the start_ ordering and the previous interval as a hint.
-static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
-                                               CostInterval* const current,
-                                               CostInterval* previous) {
-  assert(current != NULL);
-
-  if (previous == NULL) previous = manager->head_;
-  while (previous != NULL && current->start_ < previous->start_) {
-    previous = previous->previous_;
-  }
-  while (previous != NULL && previous->next_ != NULL &&
-         previous->next_->start_ < current->start_) {
-    previous = previous->next_;
-  }
-
-  if (previous != NULL) {
-    ConnectIntervals(manager, current, previous->next_);
-  } else {
-    ConnectIntervals(manager, current, manager->head_);
-  }
-  ConnectIntervals(manager, previous, current);
-}
-
-// Insert an interval in the list contained in the manager by starting at
-// interval_in as a hint. The intervals are sorted by start_ value.
-static WEBP_INLINE void InsertInterval(CostManager* const manager,
-                                       CostInterval* const interval_in,
-                                       float cost, int position, int start,
-                                       int end) {
-  CostInterval* interval_new;
-
-  if (start >= end) return;
-  if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
-    // Serialize the interval if we cannot store it.
-    UpdateCostPerInterval(manager, start, end, position, cost);
-    return;
-  }
-  if (manager->free_intervals_ != NULL) {
-    interval_new = manager->free_intervals_;
-    manager->free_intervals_ = interval_new->next_;
-  } else if (manager->recycled_intervals_ != NULL) {
-    interval_new = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval_new->next_;
-  } else {  // malloc for good
-    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
-    if (interval_new == NULL) {
-      // Write down the interval if we cannot create it.
-      UpdateCostPerInterval(manager, start, end, position, cost);
-      return;
-    }
-  }
-
-  interval_new->cost_ = cost;
-  interval_new->index_ = position;
-  interval_new->start_ = start;
-  interval_new->end_ = end;
-  PositionOrphanInterval(manager, interval_new, interval_in);
-
-  ++manager->count_;
-}
-
-// Given a new cost interval defined by its start at position, its length value
-// and distance_cost, add its contributions to the previous intervals and costs.
-// If handling the interval or one of its subintervals becomes to heavy, its
-// contribution is added to the costs right away.
-static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int position,
-                                     int len) {
-  size_t i;
-  CostInterval* interval = manager->head_;
-  CostInterval* interval_next;
-  const CostCacheInterval* const cost_cache_intervals =
-      manager->cache_intervals_;
-  // If the interval is small enough, no need to deal with the heavy
-  // interval logic, just serialize it right away. This constant is empirical.
-  const int kSkipDistance = 10;
-
-  if (len < kSkipDistance) {
-    int j;
-    for (j = position; j < position + len; ++j) {
-      const int k = j - position;
-      float cost_tmp;
-      assert(k >= 0 && k < MAX_LENGTH);
-      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
-
-      if (manager->costs_[j] > cost_tmp) {
-        manager->costs_[j] = cost_tmp;
-        manager->dist_array_[j] = k + 1;
-      }
-    }
-    return;
-  }
-
-  for (i = 0; i < manager->cache_intervals_size_ &&
-              cost_cache_intervals[i].start_ < len;
-       ++i) {
-    // Define the intersection of the ith interval with the new one.
-    int start = position + cost_cache_intervals[i].start_;
-    const int end = position + (cost_cache_intervals[i].end_ > len
-                                 ? len
-                                 : cost_cache_intervals[i].end_);
-    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
-
-    for (; interval != NULL && interval->start_ < end;
-         interval = interval_next) {
-      interval_next = interval->next_;
-
-      // Make sure we have some overlap
-      if (start >= interval->end_) continue;
-
-      if (cost >= interval->cost_) {
-        // When intervals are represented, the lower, the better.
-        // [**********************************************************[
-        // start                                                    end
-        //                   [----------------------------------[
-        //                   interval->start_       interval->end_
-        // If we are worse than what we already have, add whatever we have so
-        // far up to interval.
-        const int start_new = interval->end_;
-        InsertInterval(manager, interval, cost, position, start,
-                       interval->start_);
-        start = start_new;
-        if (start >= end) break;
-        continue;
-      }
-
-      if (start <= interval->start_) {
-        if (interval->end_ <= end) {
-          //                   [----------------------------------[
-          //                   interval->start_       interval->end_
-          // [**************************************************************[
-          // start                                                        end
-          // We can safely remove the old interval as it is fully included.
-          PopInterval(manager, interval);
-        } else {
-          //              [------------------------------------[
-          //              interval->start_        interval->end_
-          // [*****************************[
-          // start                       end
-          interval->start_ = end;
-          break;
-        }
-      } else {
-        if (end < interval->end_) {
-          // [--------------------------------------------------------------[
-          // interval->start_                                  interval->end_
-          //                     [*****************************[
-          //                     start                       end
-          // We have to split the old interval as it fully contains the new one.
-          const int end_original = interval->end_;
-          interval->end_ = start;
-          InsertInterval(manager, interval, interval->cost_, interval->index_,
-                         end, end_original);
-          interval = interval->next_;
-          break;
-        } else {
-          // [------------------------------------[
-          // interval->start_        interval->end_
-          //                     [*****************************[
-          //                     start                       end
-          interval->end_ = start;
-        }
-      }
-    }
-    // Insert the remaining interval from start to end.
-    InsertInterval(manager, interval, cost, position, start, end);
-  }
-}
-
-static int BackwardReferencesHashChainDistanceOnly(
-    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
-    const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
-    uint16_t* const dist_array) {
-  int i;
-  int ok = 0;
-  int cc_init = 0;
-  const int pix_count = xsize * ysize;
-  const int use_color_cache = (cache_bits > 0);
-  const size_t literal_array_size =
-      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
-  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
-  CostModel* const cost_model =
-      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
-  VP8LColorCache hashers;
-  CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
-  int offset_prev = -1, len_prev = -1;
-  double offset_cost = -1;
-  int first_offset_is_constant = -1;  // initialized with 'impossible' value
-  int reach = 0;
-
-  if (cost_model == NULL || cost_manager == NULL) goto Error;
-
-  cost_model->literal_ = (double*)(cost_model + 1);
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
-    goto Error;
-  }
-
-  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
-    goto Error;
-  }
-
-  // We loop one pixel at a time, but store all currently best points to
-  // non-processed locations from this point.
-  dist_array[0] = 0;
-  // Add first pixel as literal.
-  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
-                                0.f, cost_manager->costs_, dist_array);
-
-  for (i = 1; i < pix_count; ++i) {
-    const float prev_cost = cost_manager->costs_[i - 1];
-    int offset, len;
-    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
-
-    // Try adding the pixel as a literal.
-    AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
-                                  use_color_cache, prev_cost,
-                                  cost_manager->costs_, dist_array);
-
-    // If we are dealing with a non-literal.
-    if (len >= 2) {
-      if (offset != offset_prev) {
-        const int code = VP8LDistanceToPlaneCode(xsize, offset);
-        offset_cost = GetDistanceCost(cost_model, code);
-        first_offset_is_constant = 1;
-        PushInterval(cost_manager, prev_cost + offset_cost, i, len);
-      } else {
-        assert(offset_cost >= 0);
-        assert(len_prev >= 0);
-        assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
-        // Instead of considering all contributions from a pixel i by calling:
-        //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
-        // we optimize these contributions in case offset_cost stays the same
-        // for consecutive pixels. This describes a set of pixels similar to a
-        // previous set (e.g. constant color regions).
-        if (first_offset_is_constant) {
-          reach = i - 1 + len_prev - 1;
-          first_offset_is_constant = 0;
-        }
-
-        if (i + len - 1 > reach) {
-          // We can only be go further with the same offset if the previous
-          // length was maxed, hence len_prev == len == MAX_LENGTH.
-          // TODO(vrabaud), bump i to the end right away (insert cache and
-          // update cost).
-          // TODO(vrabaud), check if one of the points in between does not have
-          // a lower cost.
-          // Already consider the pixel at "reach" to add intervals that are
-          // better than whatever we add.
-          int offset_j, len_j = 0;
-          int j;
-          assert(len == MAX_LENGTH || len == pix_count - i);
-          // Figure out the last consecutive pixel within [i, reach + 1] with
-          // the same offset.
-          for (j = i; j <= reach; ++j) {
-            VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
-            if (offset_j != offset) {
-              VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
-              break;
-            }
-          }
-          // Update the cost at j - 1 and j.
-          UpdateCostAtIndex(cost_manager, j - 1, 0);
-          UpdateCostAtIndex(cost_manager, j, 0);
-
-          PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
-                       j, len_j);
-          reach = j + len_j - 1;
-        }
-      }
-    }
-
-    UpdateCostAtIndex(cost_manager, i, 1);
-    offset_prev = offset;
-    len_prev = len;
-  }
-
-  ok = !refs->error_;
-Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  CostManagerClear(cost_manager);
-  WebPSafeFree(cost_model);
-  WebPSafeFree(cost_manager);
-  return ok;
-}
-
-// We pack the path at the end of *dist_array and return
-// a pointer to this part of the array. Example:
-// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint16_t* const dist_array,
-                           int dist_array_size,
-                           uint16_t** const chosen_path,
-                           int* const chosen_path_size) {
-  uint16_t* path = dist_array + dist_array_size;
-  uint16_t* cur = dist_array + dist_array_size - 1;
-  while (cur >= dist_array) {
-    const int k = *cur;
-    --path;
-    *path = k;
-    cur -= k;
-  }
-  *chosen_path = path;
-  *chosen_path_size = (int)(dist_array + dist_array_size - path);
-}
-
-static int BackwardReferencesHashChainFollowChosenPath(
-    const uint32_t* const argb, int cache_bits,
-    const uint16_t* const chosen_path, int chosen_path_size,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
-  const int use_color_cache = (cache_bits > 0);
-  int ix;
-  int i = 0;
-  int ok = 0;
-  int cc_init = 0;
-  VP8LColorCache hashers;
-
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  VP8LClearBackwardRefs(refs);
-  for (ix = 0; ix < chosen_path_size; ++ix) {
-    const int len = chosen_path[ix];
-    if (len != 1) {
-      int k;
-      const int offset = VP8LHashChainFindOffset(hash_chain, i);
-      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
-      if (use_color_cache) {
-        for (k = 0; k < len; ++k) {
-          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        }
-      }
-      i += len;
-    } else {
-      PixOrCopy v;
-      const int idx =
-          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
-      if (idx >= 0) {
-        // use_color_cache is true and hashers contains argb[i]
-        // push pixel as a color cache index
-        v = PixOrCopyCreateCacheIdx(idx);
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        v = PixOrCopyCreateLiteral(argb[i]);
-      }
-      VP8LBackwardRefsCursorAdd(refs, v);
-      ++i;
-    }
-  }
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  return ok;
-}
-
-// Returns 1 on success.
-extern int VP8LBackwardReferencesTraceBackwards(
-    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
-    const VP8LHashChain* const hash_chain,
-    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
-int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
-                                         const uint32_t* const argb,
-                                         int cache_bits,
-                                         const VP8LHashChain* const hash_chain,
-                                         const VP8LBackwardRefs* const refs_src,
-                                         VP8LBackwardRefs* const refs_dst) {
-  int ok = 0;
-  const int dist_array_size = xsize * ysize;
-  uint16_t* chosen_path = NULL;
-  int chosen_path_size = 0;
-  uint16_t* dist_array =
-      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
-
-  if (dist_array == NULL) goto Error;
-
-  if (!BackwardReferencesHashChainDistanceOnly(
-          xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
-    goto Error;
-  }
-  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
-  if (!BackwardReferencesHashChainFollowChosenPath(
-          argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
-          refs_dst)) {
-    goto Error;
-  }
-  ok = 1;
- Error:
-  WebPSafeFree(dist_array);
-  return ok;
-}
--- a/src/enc/backward_references_enc.c
+++ b/src/enc/backward_references_enc.c
--- a/src/enc/backward_references_enc.h
+++ b/src/enc/backward_references_enc.h
@@ -113,15 +113,6 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
 #define HASH_BITS 18
 #define HASH_SIZE (1 << HASH_BITS)

-// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
-// is used in VP8LHashChain.
-#define MAX_LENGTH_BITS 12
-// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
-#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
-#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
-#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
-#endif
-
 typedef struct VP8LHashChain VP8LHashChain;
 struct VP8LHashChain {
  // The 20 most significant bits contain the offset at which the best match
@@ -143,24 +134,6 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                      int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory

-static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
-                                               const int base_position) {
-  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
-}
-
-static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
-                                               const int base_position) {
-  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
-}
-
-static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
-                                              int base_position,
-                                              int* const offset_ptr,
-                                              int* const length_ptr) {
-  *offset_ptr = VP8LHashChainFindOffset(p, base_position);
-  *length_ptr = VP8LHashChainFindLength(p, base_position);
-}
-
 // -----------------------------------------------------------------------------
 // VP8LBackwardRefs (block-based backward-references storage)

@@ -185,6 +158,9 @@ struct VP8LBackwardRefs {
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
 // Release memory for backward references.
 void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
+// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
+int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
+                         VP8LBackwardRefs* const dst);

 // Cursor for iterating on references content
 typedef struct {
@@ -213,12 +189,6 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // -----------------------------------------------------------------------------
 // Main entry points

-enum VP8LLZ77Type {
-  kLZ77Standard = 1,
-  kLZ77RLE = 2,
-  kLZ77Box = 4
-};
-
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
@@ -227,9 +197,8 @@ enum VP8LLZ77Type {
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2);
+    int low_effort, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);

 #ifdef __cplusplus
 }
--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@@ -76,7 +76,7 @@ void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                            VP8LHistogram* const histo) {
  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
  while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
    VP8LRefsCursorNext(&c);
  }
 }
@@ -138,9 +138,7 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
 // -----------------------------------------------------------------------------

 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v,
-                                     int (*const distance_modifier)(int, int),
-                                     int distance_modifier_arg0) {
+                                     const PixOrCopy* const v) {
  if (PixOrCopyIsLiteral(v)) {
    ++histo->alpha_[PixOrCopyLiteral(v, 3)];
    ++histo->red_[PixOrCopyLiteral(v, 2)];
@@ -154,13 +152,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
    int code, extra_bits;
    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
    ++histo->literal_[NUM_LITERAL_CODES + code];
-    if (distance_modifier == NULL) {
-      VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
-    } else {
-      VP8LPrefixEncodeBits(
-          distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
-          &code, &extra_bits);
-    }
+    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
    ++histo->distance_[code];
  }
 }
@@ -481,7 +473,7 @@ static void HistogramBuild(
  while (VP8LRefsCursorOk(&c)) {
    const PixOrCopy* const v = c.cur_pos;
    const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
-    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
    x += PixOrCopyLength(v);
    while (x >= xsize) {
      x -= xsize;
@@ -531,12 +523,11 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,

 // Compact image_histo[] by merging some histograms with same bin_id together if
 // it's advantageous.
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* cur_combo,
-                                       const uint16_t* const bin_map,
-                                       int bin_map_size, int num_bins,
-                                       double combine_cost_factor,
-                                       int low_effort) {
+static VP8LHistogram* HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo,
+    VP8LHistogram* cur_combo,
+    const uint16_t* const bin_map, int bin_map_size, int num_bins,
+    double combine_cost_factor, int low_effort) {
  VP8LHistogram** const histograms = image_histo->histograms;
  int idx;
  // Work in-place: processed histograms are put at the beginning of
@@ -602,13 +593,14 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
      UpdateHistogramCost(histograms[idx]);
    }
  }
+  return cur_combo;
 }

-// Implement a Lehmer random number generator with a multiplicative constant of
-// 48271 and a modulo constant of 2^31 − 1.
 static uint32_t MyRand(uint32_t* const seed) {
-  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
-  assert(*seed > 0);
+  *seed = (*seed * 16807ull) & 0xffffffffu;
+  if (*seed == 0) {
+    *seed = 1;
+  }
  return *seed;
 }

@@ -649,75 +641,57 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
 static void HistoQueueClear(HistoQueue* const histo_queue) {
  assert(histo_queue != NULL);
  WebPSafeFree(histo_queue->queue);
-  histo_queue->size = 0;
-  histo_queue->max_size = 0;
 }

-// Pop a specific pair in the queue by replacing it with the last one
-// and shrinking the queue.
-static void HistoQueuePopPair(HistoQueue* const histo_queue,
-                              HistogramPair* const pair) {
-  assert(pair >= histo_queue->queue &&
-         pair < (histo_queue->queue + histo_queue->size));
-  assert(histo_queue->size > 0);
-  *pair = histo_queue->queue[histo_queue->size - 1];
-  --histo_queue->size;
+static void SwapHistogramPairs(HistogramPair *p1,
+                               HistogramPair *p2) {
+  const HistogramPair tmp = *p1;
+  *p1 = *p2;
+  *p2 = tmp;
 }

-// Check whether a pair in the queue should be updated as head or not.
-static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
-                                 HistogramPair* const pair) {
-  assert(pair->cost_diff < 0.);
-  assert(pair >= histo_queue->queue &&
-         pair < (histo_queue->queue + histo_queue->size));
-  assert(histo_queue->size > 0);
-  if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
-    // Replace the best pair.
-    const HistogramPair tmp = histo_queue->queue[0];
-    histo_queue->queue[0] = *pair;
-    *pair = tmp;
+// Given a valid priority queue in range [0, queue_size) this function checks
+// whether histo_queue[queue_size] should be accepted and swaps it with the
+// front if it is smaller. Otherwise, it leaves it as is.
+static void UpdateQueueFront(HistoQueue* const histo_queue) {
+  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
+
+  if (histo_queue->queue[histo_queue->size].cost_diff <
+      histo_queue->queue[0].cost_diff) {
+    SwapHistogramPairs(histo_queue->queue,
+                       histo_queue->queue + histo_queue->size);
  }
+  ++histo_queue->size;
+
+  // We cannot add more elements than the capacity.
+  // The allocation adds an extra element to the official capacity so that
+  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
+  assert(histo_queue->size <= histo_queue->max_size);
 }

-// Create a pair from indices "idx1" and "idx2" provided its cost
-// is inferior to "threshold", a negative entropy.
-// It returns the cost of the pair, or 0. if it superior to threshold.
-static double HistoQueuePush(HistoQueue* const histo_queue,
-                             VP8LHistogram** const histograms, int idx1,
-                             int idx2, double threshold) {
-  const VP8LHistogram* h1;
-  const VP8LHistogram* h2;
-  HistogramPair pair;
+// -----------------------------------------------------------------------------
+
+static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
+                        HistogramPair* const pair) {
+  VP8LHistogram* h1;
+  VP8LHistogram* h2;
  double sum_cost;

-  assert(threshold <= 0.);
  if (idx1 > idx2) {
    const int tmp = idx2;
    idx2 = idx1;
    idx1 = tmp;
  }
-  pair.idx1 = idx1;
-  pair.idx2 = idx2;
+  pair->idx1 = idx1;
+  pair->idx2 = idx2;
  h1 = histograms[idx1];
  h2 = histograms[idx2];
  sum_cost = h1->bit_cost_ + h2->bit_cost_;
-  pair.cost_combo = 0.;
-  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair.cost_combo);
-  pair.cost_diff = pair.cost_combo - sum_cost;
-
-  // Do not even consider the pair if it does not improve the entropy.
-  if (pair.cost_diff >= threshold) return 0.;
-
-  // We cannot add more elements than the capacity.
-  assert(histo_queue->size < histo_queue->max_size);
-  histo_queue->queue[histo_queue->size++] = pair;
-  HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
-
-  return pair.cost_diff;
+  pair->cost_combo = 0.;
+  GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo);
+  pair->cost_diff = pair->cost_combo - sum_cost;
 }

-// -----------------------------------------------------------------------------
-
 // Combines histograms by continuously choosing the one with the highest cost
 // reduction.
 static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
@@ -740,11 +714,13 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
    clusters[i] = i;
    for (j = i + 1; j < image_histo_size; ++j) {
      // Initialize positions array.
-      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
+      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]);
+      UpdateQueueFront(&histo_queue);
    }
  }

  while (image_histo_size > 1 && histo_queue.size > 0) {
+    HistogramPair* copy_to;
    const int idx1 = histo_queue.queue[0].idx1;
    const int idx2 = histo_queue.queue[0].idx2;
    HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
@@ -757,22 +733,31 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
    }
    --image_histo_size;

-    // Remove pairs intersecting the just combined best pair.
-    for (i = 0; i < histo_queue.size;) {
+    // Remove pairs intersecting the just combined best pair. This will
+    // therefore pop the head of the queue.
+    copy_to = histo_queue.queue;
+    for (i = 0; i < histo_queue.size; ++i) {
      HistogramPair* const p = histo_queue.queue + i;
      if (p->idx1 == idx1 || p->idx2 == idx1 ||
          p->idx1 == idx2 || p->idx2 == idx2) {
-        HistoQueuePopPair(&histo_queue, p);
-      } else {
-        HistoQueueUpdateHead(&histo_queue, p);
-        ++i;
+        // Do not copy the invalid pair.
+        continue;
      }
+      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
+        // Replace the top of the queue if we found better.
+        SwapHistogramPairs(histo_queue.queue, p);
+      }
+      SwapHistogramPairs(copy_to, p);
+      ++copy_to;
    }
+    histo_queue.size = (int)(copy_to - histo_queue.queue);

    // Push new pairs formed with combined histogram to the queue.
    for (i = 0; i < image_histo_size; ++i) {
      if (clusters[i] != idx1) {
-        HistoQueuePush(&histo_queue, histograms, idx1, clusters[i], 0.);
+        PreparePair(histograms, idx1, clusters[i],
+                    &histo_queue.queue[histo_queue.size]);
+        UpdateQueueFront(&histo_queue);
      }
    }
  }
@@ -792,130 +777,90 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
  return ok;
 }

-// Perform histogram aggregation using a stochastic approach.
-// 'do_greedy' is set to 1 if a greedy approach needs to be performed
-// afterwards, 0 otherwise.
-static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
-                                      int min_cluster_size,
-                                      int* const do_greedy) {
+static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+                                       VP8LHistogram* tmp_histo,
+                                       VP8LHistogram* best_combo,
+                                       int quality, int min_cluster_size) {
  int iter;
-  uint32_t seed = 1;
+  uint32_t seed = 0;
  int tries_with_no_success = 0;
  int image_histo_size = image_histo->size;
-  const int outer_iters = image_histo_size;
+  const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
+  const int outer_iters = image_histo_size * iter_mult;
+  const int num_pairs = image_histo_size / 2;
  const int num_tries_no_success = outer_iters / 2;
+  int idx2_max = image_histo_size - 1;
+  int do_brute_dorce = 0;
  VP8LHistogram** const histograms = image_histo->histograms;
-  // Priority queue of histogram pairs. Its size of "kCostHeapSizeSqrt"^2
-  // impacts the quality of the compression and the speed: the smaller the
-  // faster but the worse for the compression.
-  HistoQueue histo_queue;
-  const int kHistoQueueSizeSqrt = 3;
-  int ok = 0;

-  if (!HistoQueueInit(&histo_queue, kHistoQueueSizeSqrt)) {
-    goto End;
-  }
  // Collapse similar histograms in 'image_histo'.
  ++min_cluster_size;
-  for (iter = 0; iter < outer_iters && image_histo_size >= min_cluster_size &&
-                 ++tries_with_no_success < num_tries_no_success;
+  for (iter = 0;
+       iter < outer_iters && image_histo_size >= min_cluster_size;
       ++iter) {
-    double best_cost =
-        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
+    double best_cost_diff = 0.;
    int best_idx1 = -1, best_idx2 = 1;
    int j;
-    const uint32_t rand_range = (image_histo_size - 1) * image_histo_size;
-    // image_histo_size / 2 was chosen empirically. Less means faster but worse
-    // compression.
-    const int num_tries = image_histo_size / 2;
+    int num_tries =
+        (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
+    // Use a brute force approach if:
+    // - stochastic has not worked for a while and
+    // - if the number of iterations for brute force is less than the number of
+    // iterations if we never find a match ever again stochastically (hence
+    // num_tries times the number of remaining outer iterations).
+    do_brute_dorce =
+        (tries_with_no_success > 10) &&
+        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
+    if (do_brute_dorce) num_tries = idx2_max;

+    seed += iter;
    for (j = 0; j < num_tries; ++j) {
-      double curr_cost;
-      // Choose two different histograms at random and try to combine them.
-      const uint32_t tmp = MyRand(&seed) % rand_range;
-      const uint32_t idx1 = tmp / (image_histo_size - 1);
-      uint32_t idx2 = tmp % (image_histo_size - 1);
-      if (idx2 >= idx1) ++idx2;
-
-      // Calculate cost reduction on combination.
-      curr_cost =
-          HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
-      if (curr_cost < 0) {  // found a better pair?
-        best_cost = curr_cost;
-        // Empty the queue if we reached full capacity.
-        if (histo_queue.size == histo_queue.max_size) break;
-      }
-    }
-    if (histo_queue.size == 0) continue;
-
-    // Merge the two best histograms.
-    best_idx1 = histo_queue.queue[0].idx1;
-    best_idx2 = histo_queue.queue[0].idx2;
-    assert(best_idx1 < best_idx2);
-    HistogramAddEval(histograms[best_idx1], histograms[best_idx2],
-                     histograms[best_idx1], 0);
-    // Swap the best_idx2 histogram with the last one (which is now unused).
-    --image_histo_size;
-    if (best_idx2 != image_histo_size) {
-      HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
-    }
-    histograms[image_histo_size] = NULL;
-    // Parse the queue and update each pair that deals with best_idx1,
-    // best_idx2 or image_histo_size.
-    for (j = 0; j < histo_queue.size;) {
-      HistogramPair* const p = histo_queue.queue + j;
-      const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
-      const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
-      int do_eval = 0;
-      // The front pair could have been duplicated by a random pick so
-      // check for it all the time nevertheless.
-      if (is_idx1_best && is_idx2_best) {
-        HistoQueuePopPair(&histo_queue, p);
-        continue;
-      }
-      // Any pair containing one of the two best indices should only refer to
-      // best_idx1. Its cost should also be updated.
-      if (is_idx1_best) {
-        p->idx1 = best_idx1;
-        do_eval = 1;
-      } else if (is_idx2_best) {
-        p->idx2 = best_idx1;
-        do_eval = 1;
-      }
-      if (p->idx2 == image_histo_size) {
-        // No need to re-evaluate here as it does not involve a pair
-        // containing best_idx1 or best_idx2.
-        p->idx2 = best_idx2;
-      }
-      assert(p->idx2 < image_histo_size);
-      // Make sure the index order is respected.
-      if (p->idx1 > p->idx2) {
-        const int tmp = p->idx2;
-        p->idx2 = p->idx1;
-        p->idx1 = tmp;
-      }
-      if (do_eval) {
-        // Re-evaluate the cost of an updated pair.
-        GetCombinedHistogramEntropy(histograms[p->idx1], histograms[p->idx2], 0,
-                                    &p->cost_diff);
-        if (p->cost_diff >= 0.) {
-          HistoQueuePopPair(&histo_queue, p);
+      double curr_cost_diff;
+      // Choose two histograms at random and try to combine them.
+      uint32_t idx1, idx2;
+      if (do_brute_dorce) {
+        // Use a brute force approach.
+        idx1 = (uint32_t)j;
+        idx2 = (uint32_t)idx2_max;
+      } else {
+        const uint32_t tmp = (j & 7) + 1;
+        const uint32_t diff =
+            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+        idx1 = MyRand(&seed) % image_histo_size;
+        idx2 = (idx1 + diff + 1) % image_histo_size;
+        if (idx1 == idx2) {
          continue;
        }
      }
-      HistoQueueUpdateHead(&histo_queue, p);
-      ++j;
-    }

-    tries_with_no_success = 0;
+      // Calculate cost reduction on combining.
+      curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
+                                        tmp_histo, best_cost_diff);
+      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
+        HistogramSwap(&best_combo, &tmp_histo);
+        best_cost_diff = curr_cost_diff;
+        best_idx1 = idx1;
+        best_idx2 = idx2;
+      }
+    }
+    if (do_brute_dorce) --idx2_max;
+
+    if (best_idx1 >= 0) {
+      HistogramSwap(&best_combo, &histograms[best_idx1]);
+      // swap best_idx2 slot with last one (which is now unused)
+      --image_histo_size;
+      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
+      if (best_idx2 != image_histo_size) {
+        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
+        histograms[image_histo_size] = NULL;
+      }
+      tries_with_no_success = 0;
+    }
+    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
+      break;
+    }
  }
  image_histo->size = image_histo_size;
-  *do_greedy = (image_histo->size <= min_cluster_size);
-  ok = 1;
-
-End:
-  HistoQueueClear(&histo_queue);
-  return ok;
 }

 // -----------------------------------------------------------------------------
@@ -980,7 +925,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             int quality, int low_effort,
                             int histo_bits, int cache_bits,
                             VP8LHistogramSet* const image_histo,
-                             VP8LHistogram* const tmp_histo,
+                             VP8LHistogramSet* const tmp_histos,
                             uint16_t* const histogram_symbols) {
  int ok = 0;
  const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
@@ -988,6 +933,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  const int image_histo_raw_size = histo_xsize * histo_ysize;
  VP8LHistogramSet* const orig_histo =
      VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+  VP8LHistogram* cur_combo;
  // Don't attempt linear bin-partition heuristic for
  // histograms of small sizes (as bin_map will be very sparse) and
  // maximum quality q==100 (to preserve the compression gains at that level).
@@ -1002,6 +948,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  // Copies the histograms and computes its bit_cost.
  HistogramCopyAndAnalyze(orig_histo, image_histo);

+  cur_combo = tmp_histos->histograms[1];  // pick up working slot
  if (entropy_combine) {
    const int bin_map_size = orig_histo->size;
    // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
@@ -1011,9 +958,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,

    HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
    // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size,
-                               entropy_combine_num_bins, combine_cost_factor,
-                               low_effort);
+    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
+                                           bin_map, bin_map_size,
+                                           entropy_combine_num_bins,
+                                           combine_cost_factor, low_effort);
  }

  // Don't combine the histograms using stochastic and greedy heuristics for
@@ -1022,11 +970,10 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
    const float x = quality / 100.f;
    // cubic ramp between 1 and MAX_HISTO_GREEDY:
    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
-    int do_greedy;
-    if (!HistogramCombineStochastic(image_histo, threshold_size, &do_greedy)) {
-      goto Error;
-    }
-    if (do_greedy && !HistogramCombineGreedy(image_histo)) {
+    HistogramCombineStochastic(image_histo, tmp_histos->histograms[0],
+                               cur_combo, quality, threshold_size);
+    if ((image_histo->size <= threshold_size) &&
+        !HistogramCombineGreedy(image_histo)) {
      goto Error;
    }
  }
--- a/src/enc/histogram_enc.h
+++ b/src/enc/histogram_enc.h
@@ -90,9 +90,7 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);

 // Accumulate a token 'v' into a histogram.
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v,
-                                     int (*const distance_modifier)(int, int),
-                                     int distance_modifier_arg0);
+                                     const PixOrCopy* const v);

 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
  return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
@@ -105,7 +103,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             int quality, int low_effort,
                             int histogram_bits, int cache_bits,
                             VP8LHistogramSet* const image_in,
-                             VP8LHistogram* const tmp_histo,
+                             VP8LHistogramSet* const tmp_histos,
                             uint16_t* const histogram_symbols);

 // Returns the entropy for the symbols in the input array.
--- a/src/enc/near_lossless_enc.c
+++ b/src/enc/near_lossless_enc.c
@@ -26,9 +26,9 @@

 // Quantizes the value up or down to a multiple of 1<<bits (or to 255),
 // choosing the closer one, resolving ties using bankers' rounding.
-static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
-  const uint32_t mask = (1u << bits) - 1;
-  const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
+static int FindClosestDiscretized(int a, int bits) {
+  const int mask = (1 << bits) - 1;
+  const int biased = a + (mask >> 1) + ((a >> bits) & 1);
  assert(bits > 0);
  if (biased > 0xff) return 0xff;
  return biased & ~mask;
@@ -69,30 +69,22 @@ static int IsSmooth(const uint32_t* const prev_row,
 }

 // Adjusts pixel values of image with given maximum error.
-static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
-                         int stride, int limit_bits, uint32_t* copy_buffer,
-                         uint32_t* argb_dst) {
+static void NearLossless(int xsize, int ysize, uint32_t* argb,
+                         int limit_bits, uint32_t* copy_buffer) {
  int x, y;
  const int limit = 1 << limit_bits;
  uint32_t* prev_row = copy_buffer;
  uint32_t* curr_row = prev_row + xsize;
  uint32_t* next_row = curr_row + xsize;
-  memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
-  memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));

-  for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
-    if (y == 0 || y == ysize - 1) {
-      memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
-    } else {
-      memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
-      argb_dst[0] = argb_src[0];
-      argb_dst[xsize - 1] = argb_src[xsize - 1];
-      for (x = 1; x < xsize - 1; ++x) {
-        if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
-          argb_dst[x] = curr_row[x];
-        } else {
-          argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
-        }
+  for (y = 1; y < ysize - 1; ++y) {
+    uint32_t* const curr_argb_row = argb + y * xsize;
+    uint32_t* const next_argb_row = curr_argb_row + xsize;
+    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
+    for (x = 1; x < xsize - 1; ++x) {
+      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
      }
    }
    {
@@ -105,37 +97,25 @@ static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
  }
 }

-int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
-                         uint32_t* const argb_dst) {
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
  int i;
-  const int xsize = picture->width;
-  const int ysize = picture->height;
-  const int stride = picture->argb_stride;
  uint32_t* const copy_buffer =
      (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
  const int limit_bits = VP8LNearLosslessBits(quality);
-  assert(argb_dst != NULL);
-  assert(limit_bits > 0);
+  assert(argb != NULL);
+  assert(limit_bits >= 0);
  assert(limit_bits <= MAX_LIMIT_BITS);
  if (copy_buffer == NULL) {
    return 0;
  }
  // For small icon images, don't attempt to apply near-lossless compression.
-  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
-       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
-      ysize < 3) {
-    for (i = 0; i < ysize; ++i) {
-      memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
-             xsize * sizeof(*argb_dst));
-    }
+  if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) {
    WebPSafeFree(copy_buffer);
    return 1;
  }

-  NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
-               argb_dst);
-  for (i = limit_bits - 1; i != 0; --i) {
-    NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
+  for (i = limit_bits; i != 0; --i) {
+    NearLossless(xsize, ysize, argb, i, copy_buffer);
  }
  WebPSafeFree(copy_buffer);
  return 1;
--- a/src/enc/picture_csp_enc.c
+++ b/src/enc/picture_csp_enc.c
@@ -171,7 +171,7 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)

 // float variant of gamma-correction
-// We use tables of different size and precision for the Rec709 / BT2020
+// We use tables of different size and precision for the Rec709
 // transfer function.
 #define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
@@ -183,8 +183,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
    int v;
    const double norm = 1. / MAX_Y_T;
    const double scale = 1. / kGammaTabSize;
-    const double a = 0.09929682680944;
-    const double thresh = 0.018053968510807;
+    const double a = 0.099;
+    const double thresh = 0.018;
    for (v = 0; v <= MAX_Y_T; ++v) {
      const double g = norm * v;
      if (g <= thresh * 4.5) {
@@ -1105,14 +1105,9 @@ static int Import(WebPPicture* const picture,

  if (import_alpha) {
    uint32_t* dst = picture->argb;
-    const int do_copy = !swap_rb && !ALPHA_IS_LAST;
    assert(step == 4);
    for (y = 0; y < height; ++y) {
-      if (do_copy) {
-        memcpy(dst, r_ptr, width * sizeof(*dst));
-      } else {
-        VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
-      }
+      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
      a_ptr += rgb_stride;
      r_ptr += rgb_stride;
      g_ptr += rgb_stride;
--- a/src/enc/predictor_enc.c
+++ b/src/enc/predictor_enc.c
@@ -180,7 +180,6 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
 // max_quantization which is a power of 2, smaller than max_diff). Take care if
 // value and predict have undergone subtract green, which means that red and
 // blue are represented as offsets from green.
-#define NEAR_LOSSLESS_DIFF(a, b) (uint8_t)((((int)(a) - (int)(b))) & 0xff)
 static uint32_t NearLossless(uint32_t value, uint32_t predict,
                             int max_quantization, int max_diff,
                             int used_subtract_green) {
@@ -197,7 +196,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
  }
  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
    // Preserve transparency of fully transparent or fully opaque pixels.
-    a = NEAR_LOSSLESS_DIFF(value >> 24, predict >> 24);
+    a = ((value >> 24) - (predict >> 24)) & 0xff;
  } else {
    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
  }
@@ -210,16 +209,15 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
    // The amount by which green has been adjusted during quantization. It is
    // subtracted from red and blue for compensation, to avoid accumulating two
    // quantization errors in them.
-    green_diff = NEAR_LOSSLESS_DIFF(new_green, value >> 8);
+    green_diff = (new_green - (value >> 8)) & 0xff;
  }
-  r = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value >> 16, green_diff),
+  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
                            (predict >> 16) & 0xff, 0xff - new_green,
                            quantization);
-  b = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value, green_diff),
-                            predict & 0xff, 0xff - new_green, quantization);
+  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
+                            0xff - new_green, quantization);
  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
 }
-#undef NEAR_LOSSLESS_DIFF

 // Stores the difference between the pixel and its prediction in "out".
 // In case of a lossy encoding, updates the source image to avoid propagating
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@@ -504,8 +504,7 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);

  // in near_lossless.c
 // Near lossless preprocessing in RGB color-space.
-int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
-                         uint32_t* const argb_dst);
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
 // Near lossless adjustment for predictors.
 void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
                                 const uint32_t* argb_orig,
--- a/src/enc/vp8l_enc.c
+++ b/src/enc/vp8l_enc.c
--- a/src/enc/vp8li_enc.h
+++ b/src/enc/vp8li_enc.h
@@ -27,24 +27,16 @@ extern "C" {
 // maximum value of transform_bits_ in VP8LEncoder.
 #define MAX_TRANSFORM_BITS 6

-typedef enum {
-  kEncoderNone = 0,
-  kEncoderARGB,
-  kEncoderNearLossless,
-  kEncoderPalette
-} VP8LEncoderARGBContent;
-
 typedef struct {
  const WebPConfig* config_;      // user configuration and parameters
  const WebPPicture* pic_;        // input picture.

-  uint32_t* argb_;                       // Transformed argb image data.
-  VP8LEncoderARGBContent argb_content_;  // Content type of the argb buffer.
-  uint32_t* argb_scratch_;               // Scratch memory for argb rows
-                                         // (used for prediction).
-  uint32_t* transform_data_;             // Scratch memory for transform data.
-  uint32_t* transform_mem_;              // Currently allocated memory.
-  size_t    transform_mem_size_;         // Currently allocated memory size.
+  uint32_t* argb_;                // Transformed argb image data.
+  uint32_t* argb_scratch_;        // Scratch memory for argb rows
+                                  // (used for prediction).
+  uint32_t* transform_data_;      // Scratch memory for transform data.
+  uint32_t* transform_mem_;       // Currently allocated memory.
+  size_t    transform_mem_size_;  // Currently allocated memory size.

  int       current_width_;       // Corresponds to packed image width.

@@ -62,7 +54,8 @@ typedef struct {
  uint32_t palette_[MAX_PALETTE_SIZE];

  // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
+  struct VP8LBackwardRefs refs_[2];  // Backward Refs array corresponding to
+                                     // LZ77 & RLE coding.
  VP8LHashChain hash_chain_;         // HashChain data for constructing
                                     // backward references.
 } VP8LEncoder;
--- a/src/mux/anim_encode.c
+++ b/src/mux/anim_encode.c
@@ -35,7 +35,7 @@
 // Stores frame rectangle dimensions.
 typedef struct {
  int x_offset_, y_offset_, width_, height_;
-} FrameRectangle;
+} FrameRect;

 // Used to store two candidates of encoded data for an animation frame. One of
 // the two will be chosen later.
@@ -50,7 +50,7 @@ struct WebPAnimEncoder {
  const int canvas_height_;                 // Canvas height.
  const WebPAnimEncoderOptions options_;    // Global encoding options.

-  FrameRectangle prev_rect_;          // Previous WebP frame rectangle.
+  FrameRect prev_rect_;               // Previous WebP frame rectangle.
  WebPConfig last_config_;            // Cached in case a re-encode is needed.
  WebPConfig last_config_reversed_;   // If 'last_config_' uses lossless, then
                                      // this config uses lossy and vice versa;
@@ -206,7 +206,7 @@ static void ClearRectangle(WebPPicture* const picture,
 }

 static void WebPUtilClearPic(WebPPicture* const picture,
-                             const FrameRectangle* const rect) {
+                             const FrameRect* const rect) {
  if (rect != NULL) {
    ClearRectangle(picture, rect->x_offset_, rect->y_offset_,
                   rect->width_, rect->height_);
@@ -400,7 +400,7 @@ static WEBP_INLINE int ComparePixelsLossy(const uint32_t* src, int src_step,
  return 1;
 }

-static int IsEmptyRect(const FrameRectangle* const rect) {
+static int IsEmptyRect(const FrameRect* const rect) {
  return (rect->width_ == 0) || (rect->height_ == 0);
 }

@@ -413,7 +413,7 @@ static int QualityToMaxDiff(float quality) {
 // Assumes that an initial valid guess of change rectangle 'rect' is passed.
 static void MinimizeChangeRectangle(const WebPPicture* const src,
                                    const WebPPicture* const dst,
-                                    FrameRectangle* const rect,
+                                    FrameRect* const rect,
                                    int is_lossless, float quality) {
  int i, j;
  const ComparePixelsFunc compare_pixels =
@@ -498,7 +498,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
 }

 // Snap rectangle to even offsets (and adjust dimensions if needed).
-static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) {
+static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
  rect->width_ += (rect->x_offset_ & 1);
  rect->height_ += (rect->y_offset_ & 1);
  rect->x_offset_ &= ~1;
@@ -508,9 +508,9 @@ static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) {
 typedef struct {
  int should_try_;               // Should try this set of parameters.
  int empty_rect_allowed_;       // Frame with empty rectangle can be skipped.
-  FrameRectangle rect_ll_;       // Frame rectangle for lossless compression.
+  FrameRect rect_ll_;            // Frame rectangle for lossless compression.
  WebPPicture sub_frame_ll_;     // Sub-frame pic for lossless compression.
-  FrameRectangle rect_lossy_;    // Frame rectangle for lossy compression.
+  FrameRect rect_lossy_;         // Frame rectangle for lossy compression.
                                 // Could be smaller than rect_ll_ as pixels
                                 // with small diffs can be ignored.
  WebPPicture sub_frame_lossy_;  // Sub-frame pic for lossless compression.
@@ -538,8 +538,7 @@ static void SubFrameParamsFree(SubFrameParams* const params) {
 static int GetSubRect(const WebPPicture* const prev_canvas,
                      const WebPPicture* const curr_canvas, int is_key_frame,
                      int is_first_frame, int empty_rect_allowed,
-                      int is_lossless, float quality,
-                      FrameRectangle* const rect,
+                      int is_lossless, float quality, FrameRect* const rect,
                      WebPPicture* const sub_frame) {
  if (!is_key_frame || is_first_frame) {  // Optimize frame rectangle.
    // Note: This behaves as expected for first frame, as 'prev_canvas' is
@@ -595,7 +594,7 @@ int WebPAnimEncoderRefineRect(
    const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
    int is_lossless, float quality, int* const x_offset, int* const y_offset,
    int* const width, int* const height) {
-  FrameRectangle rect;
+  FrameRect rect;
  const int right = clip(*x_offset + *width, 0, curr_canvas->width);
  const int left = clip(*x_offset, 0, curr_canvas->width - 1);
  const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
@@ -621,7 +620,7 @@ int WebPAnimEncoderRefineRect(
 }

 static void DisposeFrameRectangle(int dispose_method,
-                                  const FrameRectangle* const rect,
+                                  const FrameRect* const rect,
                                  WebPPicture* const curr_canvas) {
  assert(rect != NULL);
  if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
@@ -629,13 +628,13 @@ static void DisposeFrameRectangle(int dispose_method,
  }
 }

-static uint32_t RectArea(const FrameRectangle* const rect) {
+static uint32_t RectArea(const FrameRect* const rect) {
  return (uint32_t)rect->width_ * rect->height_;
 }

 static int IsLosslessBlendingPossible(const WebPPicture* const src,
                                      const WebPPicture* const dst,
-                                      const FrameRectangle* const rect) {
+                                      const FrameRect* const rect) {
  int i, j;
  assert(src->width == dst->width && src->height == dst->height);
  assert(rect->x_offset_ + rect->width_ <= dst->width);
@@ -657,7 +656,7 @@ static int IsLosslessBlendingPossible(const WebPPicture* const src,

 static int IsLossyBlendingPossible(const WebPPicture* const src,
                                   const WebPPicture* const dst,
-                                   const FrameRectangle* const rect,
+                                   const FrameRect* const rect,
                                   float quality) {
  const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
  int i, j;
@@ -684,7 +683,7 @@ static int IsLossyBlendingPossible(const WebPPicture* const src,
 // transparent pixels.
 // Returns true if at least one pixel gets modified.
 static int IncreaseTransparency(const WebPPicture* const src,
-                                const FrameRectangle* const rect,
+                                const FrameRect* const rect,
                                WebPPicture* const dst) {
  int i, j;
  int modified = 0;
@@ -710,7 +709,7 @@ static int IncreaseTransparency(const WebPPicture* const src,
 // Assumes lossy compression is being used.
 // Returns true if at least one pixel gets modified.
 static int FlattenSimilarBlocks(const WebPPicture* const src,
-                                const FrameRectangle* const rect,
+                                const FrameRect* const rect,
                                WebPPicture* const dst, float quality) {
  const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
  int i, j;
@@ -779,13 +778,13 @@ static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
 typedef struct {
  WebPMemoryWriter  mem_;
  WebPMuxFrameInfo  info_;
-  FrameRectangle    rect_;
+  FrameRect         rect_;
  int               evaluate_;  // True if this candidate should be evaluated.
 } Candidate;

 // Generates a candidate encoded frame given a picture and metadata.
 static WebPEncodingError EncodeCandidate(WebPPicture* const sub_frame,
-                                         const FrameRectangle* const rect,
+                                         const FrameRect* const rect,
                                         const WebPConfig* const encoder_config,
                                         int use_blending,
                                         Candidate* const candidate) {
@@ -959,7 +958,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
  if (new_duration >= MAX_DURATION) {  // Special case.
    // Separate out previous frame from earlier merged frames to avoid overflow.
    // We add a 1x1 transparent frame for the previous frame, with blending on.
-    const FrameRectangle rect = { 0, 0, 1, 1 };
+    const FrameRect rect = { 0, 0, 1, 1 };
    const uint8_t lossless_1x1_bytes[] = {
      0x52, 0x49, 0x46, 0x46, 0x14, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
      0x56, 0x50, 0x38, 0x4c, 0x08, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
@@ -1224,7 +1223,7 @@ static int CacheFrame(WebPAnimEncoder* const enc,
      enc->prev_candidate_undecided_ = 0;
    } else {
      int64_t curr_delta;
-      FrameRectangle prev_rect_key, prev_rect_sub;
+      FrameRect prev_rect_key, prev_rect_sub;

      // Add this as a frame rectangle to enc.
      error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
--- a/src/mux/muxinternal.c
+++ b/src/mux/muxinternal.c
@@ -504,20 +504,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
    if (!has_animation && (num_anim == 1 || num_frames > 0)) {
      return WEBP_MUX_INVALID_ARGUMENT;
    }
-    if (!has_animation) {
-      const WebPMuxImage* images = mux->images_;
-      // There can be only one image.
-      if (images == NULL || images->next_ != NULL) {
-        return WEBP_MUX_INVALID_ARGUMENT;
-      }
-      // Size must match.
-      if (mux->canvas_width_ > 0) {
-        if (images->width_ != mux->canvas_width_ ||
-            images->height_ != mux->canvas_height_) {
-          return WEBP_MUX_INVALID_ARGUMENT;
-        }
-      }
-    }
  }

  // Verify either VP8X chunk is present OR there is only one elem in
@@ -529,7 +515,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
  if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;

  // ALPHA_FLAG & alpha chunk(s) are consistent.
-  // Note: ALPHA_FLAG can be set when there is actually no Alpha data present.
  if (MuxHasAlpha(mux->images_)) {
    if (num_vp8x > 0) {
      // VP8X chunk is present, so it should contain ALPHA_FLAG.
@@ -540,6 +525,8 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
      if (err != WEBP_MUX_OK) return err;
      if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
    }
+  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
+    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
  }

  return WEBP_MUX_OK;
--- a/src/mux/muxread.c
+++ b/src/mux/muxread.c
@@ -270,9 +270,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
    ChunkInit(&chunk);
  }

-  // Incomplete image.
-  if (wpi->is_partial_) goto Err;
-
  // Validate mux if complete.
  if (MuxValidate(mux) != WEBP_MUX_OK) goto Err;

--- a/src/utils/bit_reader_utils.h
+++ b/src/utils/bit_reader_utils.h
@@ -155,10 +155,9 @@ static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {

 // For jumping over a number of bits in the bit stream when accessed with
 // VP8LPrefetchBits and VP8LFillBitWindow.
-// This function does *not* set br->eos_, since it's speed-critical.
-// Use with extreme care!
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
  br->bit_pos_ = val;
+  br->eos_ = VP8LIsEndOfStream(br);
 }

 // Advances the read buffer by 4 bytes to make room for reading next 32 bits.
--- a/src/utils/bit_writer_utils.c
+++ b/src/utils/bit_writer_utils.c
@@ -239,18 +239,6 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
  return VP8LBitWriterResize(bw, expected_size);
 }

-int VP8LBitWriterClone(const VP8LBitWriter* const src,
-                       VP8LBitWriter* const dst) {
-  const size_t current_size = src->cur_ - src->buf_;
-  assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
-  if (!VP8LBitWriterResize(dst, current_size)) return 0;
-  memcpy(dst->buf_, src->buf_, current_size);
-  dst->bits_ = src->bits_;
-  dst->used_ = src->used_;
-  dst->error_ = src->error_;
-  return 1;
-}
-
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
  if (bw != NULL) {
    WebPSafeFree(bw->buf_);
@@ -258,21 +246,6 @@ void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
  }
 }

-void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
-                        VP8LBitWriter* const bw) {
-  bw->bits_ = bw_init->bits_;
-  bw->used_ = bw_init->used_;
-  bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
-  assert(bw->cur_ <= bw->end_);
-  bw->error_ = bw_init->error_;
-}
-
-void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
-  const VP8LBitWriter tmp = *src;
-  *src = *dst;
-  *dst = tmp;
-}
-
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
  // If needed, make some room by flushing some bits out.
  if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
--- a/src/utils/bit_writer_utils.h
+++ b/src/utils/bit_writer_utils.h
@@ -100,24 +100,16 @@ typedef struct {
  int error_;
 } VP8LBitWriter;

-static WEBP_INLINE size_t VP8LBitWriterNumBytes(const VP8LBitWriter* const bw) {
+static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
  return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }

 // Returns false in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
-// Returns false in case of memory allocation error.
-int VP8LBitWriterClone(const VP8LBitWriter* const src,
-                       VP8LBitWriter* const dst);
 // Finalize the bitstream coding. Returns a pointer to the internal buffer.
 uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
 // Release any pending memory and zeroes the object.
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw);
-// Resets the cursor of the BitWriter bw to when it was like in bw_init.
-void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
-                        VP8LBitWriter* const bw);
-// Swaps the memory held by two BitWriters.
-void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst);

 // Internal function for VP8LPutBits flushing 32 bits from the written state.
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw);
--- a/src/utils/color_cache_utils.h
+++ b/src/utils/color_cache_utils.h
@@ -15,8 +15,6 @@
 #ifndef WEBP_UTILS_COLOR_CACHE_H_
 #define WEBP_UTILS_COLOR_CACHE_H_

-#include <assert.h>
-
 #include "../webp/types.h"

 #ifdef __cplusplus
@@ -32,7 +30,7 @@ typedef struct {

 static const uint64_t kHashMul = 0x1e35a7bdull;

-static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
+static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
  return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
 }

@@ -50,19 +48,19 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,

 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                             uint32_t argb) {
-  const int key = VP8LHashPix(argb, cc->hash_shift_);
+  const int key = HashPix(argb, cc->hash_shift_);
  cc->colors_[key] = argb;
 }

 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  return VP8LHashPix(argb, cc->hash_shift_);
+  return HashPix(argb, cc->hash_shift_);
 }

 // Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const int key = VP8LHashPix(argb, cc->hash_shift_);
+  const int key = HashPix(argb, cc->hash_shift_);
  return (cc->colors_[key] == argb) ? key : -1;
 }

--- a/src/utils/quant_levels_dec_utils.c
+++ b/src/utils/quant_levels_dec_utils.c
@@ -71,11 +71,10 @@ typedef struct {

 //------------------------------------------------------------------------------

-#define CLIP_8b_MASK (int)(~0U << (8 + DFIX))
+#define CLIP_MASK (int)(~0U << (8 + DFIX))
 static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & CLIP_8b_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+  return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
 }
-#undef CLIP_8b_MASK

 // vertical accumulation
 static void VFilter(SmoothParams* const p) {
--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@@ -16,7 +16,6 @@
 #include "../webp/decode.h"
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
-#include "./color_cache_utils.h"
 #include "./utils.h"

 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
@@ -253,6 +252,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
  int num_colors = 0;
  uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
  uint32_t colors[COLOR_HASH_SIZE];
+  static const uint64_t kHashMul = 0x1e35a7bdull;
  const uint32_t* argb = pic->argb;
  const int width = pic->width;
  const int height = pic->height;
@@ -267,7 +267,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
        continue;
      }
      last_pix = argb[x];
-      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
+      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
      while (1) {
        if (!in_use[key]) {
          colors[key] = last_pix;
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -66,7 +66,7 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
  uint32_t A;
-  memcpy(&A, ptr, sizeof(A));
+  memcpy(&A, (const int*)ptr, sizeof(A));
  return A;
 }
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
@@ -112,12 +112,12 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
 #define WEBP_NEED_LOG_TABLE_8BIT
 extern const uint8_t WebPLogTable8bit[256];
 static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
-  int log_value = 0;
+  int log = 0;
  while (n >= 256) {
-    log_value += 8;
+    log += 8;
    n >>= 8;
  }
-  return log_value + WebPLogTable8bit[n];
+  return log + WebPLogTable8bit[n];
 }

 // Returns (int)floor(log2(n)). n must be > 0.
--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@@ -93,11 +93,7 @@ typedef enum WebPImageHint {
 // Compression parameters.
 struct WebPConfig {
  int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
-  float quality;          // between 0 and 100. For lossy, 0 gives the smallest
-                          // size and 100 the largest. For lossless, this
-                          // parameter is the amount of effort put into the
-                          // compression: 0 is the fastest but gives larger
-                          // files compared to the slowest, but best, 100.
+  float quality;          // between 0 (smallest file) and 100 (biggest)
  int method;             // quality/speed trade-off (0=fast, 6=slower-better)

  WebPImageHint image_hint;  // Hint for image type (lossless only for now).
--- a/webp_js/index.html
+++ b/webp_js/index.html
@@ -1,72 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-
-<head>
-  <meta charset="UTF-8">
-  <title>simple Javascript WebP decoding demo</title>
-  <script type="text/javascript">
-    var Module = {
-      noInitialRun : true
-    };
-  </script>
-  <script type="text/javascript" src="./webp.js"></script>
-  <script type="text/javascript">
-
-// main wrapper for the function decoding a WebP into a canvas object
-var WebpToCanvas;
-
-function init() {
-  WebpToCanvas = Module.cwrap('WebpToSDL', 'number', ['array', 'number']);
-}
-
-function decode(webp_data, canvas_id) {
-  // get the canvas to decode into
-  var canvas = document.getElementById(canvas_id);
-  if (canvas == null) return;
-  // clear previous picture (if any)
-  Module.canvas = canvas;
-  canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
-  // decode and measure timing
-  start = new Date();
-  var ret = WebpToCanvas(webp_data, webp_data.length);
-  end = new Date();
-  speed_result = document.getElementById('timing');
-  // display timing result
-  if (speed_result != null) {
-    var decode_time = end - start;
-    speed_result.innerHTML = '<p>decoding time: ' + decode_time +' ms.</p>';
-  }
-}
-
-function loadfile(filename, canvas_id) {
-  var xhr = new XMLHttpRequest();
-  xhr.open('GET', filename);
-  xhr.responseType = 'arraybuffer';
-  xhr.onreadystatechange = function() {
-    if (xhr.readyState == 4 && xhr.status == 200) {
-      var webp_data = new Uint8Array(xhr.response);
-      decode(webp_data, canvas_id);
-    }
-  };
-  xhr.send();
-}
-  </script>
-</head>
-
-<body onload='init()'>
-  <p>
-    <strong>WebP in JavaScript demo</strong> -
-  </p>
-  <p>
-    WebP decoder in JavaScript, using libwebp compiled with
-    <a href="https://github.com/kripken/emscripten/wiki">Emscripten</a>.
-  </p>
-  <p id="image_buttons">
-    <input type="button" value="test image!" name="./test_webp_js.webp"
-           onclick="loadfile(this.name, 'output_canvas')">
-  </p>
-  <p id="timing">Timing: N/A</p>
-  <canvas id="output_canvas">Your browser does not support canvas</canvas>
-
-</body>
-</html>
--- a/webp_js/index_wasm.html
+++ b/webp_js/index_wasm.html
@@ -1,84 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-
-<head>
-  <meta charset="UTF-8">
-  <title>simple Javascript WebP decoding demo, using Web-Assembly (WASM)</title>
-  <script type="text/javascript">
-    var Module = {
-      noInitialRun : true
-    };
-  </script>
-  <script type="text/javascript">
-
-function init() {
-  var xhr = new XMLHttpRequest();
-  xhr.open('GET', 'webp_wasm.wasm', true);
-  xhr.responseType = 'arraybuffer';
-  xhr.onload = function() {
-    Module.wasmBinary = xhr.response;
-    var script = document.createElement('script');
-    script.src = "webp_wasm.js";
-    document.body.appendChild(script);
-  };
-  xhr.send(null);
-}
-
-function decode(webp_data, canvas_id) {
-  var result;
-  if (Module["asm"] != undefined) {
-    // wrapper for the function decoding a WebP into a canvas object
-    WebpToCanvas = Module.cwrap('WebpToSDL', 'number', ['array', 'number']);
-    // get the canvas to decode into
-    var canvas = document.getElementById(canvas_id);
-    if (canvas == null) return;
-    // clear previous picture (if any)
-    Module.canvas = canvas;
-    canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height);
-    // decode and measure timing
-    start = new Date();
-    var ret = WebpToCanvas(webp_data, webp_data.length);
-    end = new Date();
-    var decode_time = end - start;
-    result = 'decoding time: ' + decode_time +' ms.';
-  } else {
-    result = "WASM module not finished loading! Please retry";
-  }
-  // display timing result
-  speed_result = document.getElementById('timing');
-  if (speed_result != null) {
-    speed_result.innerHTML = '<p>'+ result + '</p>';
-  }
-}
-
-function loadfile(filename, canvas_id) {
-  var xhr = new XMLHttpRequest();
-  xhr.open('GET', filename);
-  xhr.responseType = 'arraybuffer';
-  xhr.onreadystatechange = function() {
-    if (xhr.readyState == 4 && xhr.status == 200) {
-      var webp_data = new Uint8Array(xhr.response);
-      decode(webp_data, canvas_id);
-    }
-  };
-  xhr.send();
-}
-  </script>
-</head>
-
-<body onload='init()'>
-  <p>
-    <strong>WebP demo using Web-Assembly</strong> -
-  </p>
-  <p>
-    WASM version of the WebP decoder, using libwebp compiled with
-    <a href="https://github.com/kripken/emscripten/wiki">Emscripten</a>.
-  </p>
-  <p id="image_buttons">
-    <input type="button" value="test image!"
-           onclick="loadfile('./test_webp_wasm.webp', 'output_canvas')">
-  </p>
-  <p id="timing">Timing: N/A</p>
-  <canvas id="output_canvas">Your browser does not support canvas</canvas>
-</body>
-</html>
--- a/webp_js/test_webp_js.webp
+++ b/webp_js/test_webp_js.webp
--- a/webp_js/test_webp_wasm.webp
+++ b/webp_js/test_webp_wasm.webp