cwebp,get_disto: fix bpp output

bits-per-pixel were intended, not bytes-per-pixel Change-Id: I023349013ac5956154ab4526bd1e195dfe95b8ab
Fix CMake with WASM.
2018-04-10 15:51:23 -07:00 · 2018-04-05 16:25:15 +02:00 · 2018-04-05 16:25:15 +02:00 · 2018-04-04 01:35:03 +00:00 · 2018-04-03 17:49:08 -07:00 · 2018-04-03 14:10:50 +00:00
67 changed files with 1981 additions and 1007 deletions
--- a/Android.mk
+++ b/Android.mk
@ -85,11 +85,13 @@ dsp_dec_srcs := \
    src/dsp/upsampling_msa.c \
    src/dsp/upsampling_neon.$(NEON) \
    src/dsp/upsampling_sse2.c \
+    src/dsp/upsampling_sse41.c \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_mips_dsp_r2.c \
    src/dsp/yuv_neon.$(NEON) \
    src/dsp/yuv_sse2.c \
+    src/dsp/yuv_sse41.c \

 dsp_enc_srcs := \
    src/dsp/cost.c \
@ -121,7 +123,6 @@ enc_srcs := \
    src/enc/backward_references_enc.c \
    src/enc/config_enc.c \
    src/enc/cost_enc.c \
-    src/enc/delta_palettization_enc.c \
    src/enc/filter_enc.c \
    src/enc/frame_enc.c \
    src/enc/histogram_enc.c \
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,17 +1,16 @@
-cmake_minimum_required(VERSION 2.8.7)
+cmake_minimum_required(VERSION 3.5)

-project(libwebp C)
+project(WebP C)

 # Options for coder / decoder executables.
 option(WEBP_ENABLE_SIMD "Enable any SIMD optimization." ON)
-option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." OFF)
-option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." OFF)
-option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." OFF)
-option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." OFF)
-option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." OFF)
+option(WEBP_BUILD_CWEBP "Build the cwebp command line tool." ON)
+option(WEBP_BUILD_DWEBP "Build the dwebp command line tool." ON)
+option(WEBP_BUILD_GIF2WEBP "Build the gif2webp conversion tool." ON)
+option(WEBP_BUILD_IMG2WEBP "Build the img2webp animation tool." ON)
+option(WEBP_BUILD_WEBPINFO "Build the webpinfo command line tool." ON)
 option(WEBP_BUILD_WEBP_JS "Emscripten build of webp.js." OFF)
-option(WEBP_ENABLE_NEAR_LOSSLESS "Enable near-lossless encoding" ON)
-option(WEBP_EXPERIMENTAL_FEATURES "Build with experimental features." OFF)
+option(WEBP_NEAR_LOSSLESS "Enable near-lossless encoding" ON)
 option(WEBP_ENABLE_SWAP_16BIT_CSP "Enable byte swap for 16 bit colorspaces." OFF)

 if(WEBP_BUILD_WEBP_JS)
@ -23,12 +22,13 @@ set(WEBP_DEP_INCLUDE_DIRS)

 if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release" CACHE
-    "Build type: Release, Debug or RelWithDebInfo" STRING FORCE
+    "Build type: Release, Debug, MinSizeRel or RelWithDebInfo" STRING FORCE
  )
 endif()

 # Include dependencies.
 include(cmake/deps.cmake)
+include(GNUInstallDirs)

 ################################################################################
 # Options.
@ -101,6 +101,11 @@ foreach(FILE ${WEBP_SIMD_FILES_NOT_TO_INCLUDE})
  list(REMOVE_ITEM WEBP_DSP_DEC_SRCS ${FILE})
 endforeach()

+# Generate the config.h file.
+configure_file(${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in
+  ${CMAKE_CURRENT_BINARY_DIR}/src/webp/config.h)
+add_definitions(-DHAVE_CONFIG_H)
+
 ### Define the mandatory libraries.
 # Build the webpdecoder library.
 if(MSVC)
@ -109,24 +114,63 @@ if(MSVC)
 else()
  add_definitions(-Wall)
 endif()
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${WEBP_DEP_INCLUDE_DIRS})
+include_directories(${WEBP_DEP_INCLUDE_DIRS})
 add_library(webpdecode OBJECT ${WEBP_DEC_SRCS})
+target_include_directories(webpdecode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                              ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webpdspdecode OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS})
+target_include_directories(webpdspdecode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                                 ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webputilsdecode OBJECT ${WEBP_UTILS_COMMON_SRCS}
-  ${WEBP_UTILS_DEC_SRCS})
+                                   ${WEBP_UTILS_DEC_SRCS}
+)
+target_include_directories(webputilsdecode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                                   ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webpdecoder $<TARGET_OBJECTS:webpdecode>
  $<TARGET_OBJECTS:webpdspdecode> $<TARGET_OBJECTS:webputilsdecode>)
 target_link_libraries(webpdecoder ${WEBP_DEP_LIBRARIES})
+target_include_directories(webpdecoder
+  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+          ${CMAKE_CURRENT_SOURCE_DIR}
+  INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+            $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+)
+set_target_properties(webpdecoder PROPERTIES PUBLIC_HEADER
+"${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h"
+)

 # Build the webp library.
 add_library(webpencode OBJECT ${WEBP_ENC_SRCS})
+target_include_directories(webpencode PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                              ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webpdsp OBJECT ${WEBP_DSP_COMMON_SRCS} ${WEBP_DSP_DEC_SRCS}
  ${WEBP_DSP_ENC_SRCS})
+target_include_directories(webpdsp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                           ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webputils OBJECT ${WEBP_UTILS_COMMON_SRCS} ${WEBP_UTILS_DEC_SRCS}
  ${WEBP_UTILS_ENC_SRCS})
+target_include_directories(webputils PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+                                             ${CMAKE_CURRENT_SOURCE_DIR}
+)
 add_library(webp $<TARGET_OBJECTS:webpdecode> $<TARGET_OBJECTS:webpdsp>
  $<TARGET_OBJECTS:webpencode> $<TARGET_OBJECTS:webputils>)
 target_link_libraries(webp ${WEBP_DEP_LIBRARIES})
+target_include_directories(webp
+                           PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+                                   ${CMAKE_CURRENT_BINARY_DIR}
+                           PUBLIC $<INSTALL_INTERFACE:include>
+)
+set_target_properties(webp PROPERTIES PUBLIC_HEADER
+"${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h"
+)

 # Make sure the OBJECT libraries are built with position independent code
 # (it is not ON by default).
@ -136,6 +180,17 @@ set_target_properties(webpdecode webpdspdecode webputilsdecode
 # Build the webp demux library.
 add_library(webpdemux ${WEBP_DEMUX_SRCS})
 target_link_libraries(webpdemux webp)
+target_include_directories(webpdemux
+                           PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+                                   ${CMAKE_CURRENT_BINARY_DIR}
+                           PUBLIC $<INSTALL_INTERFACE:include>
+)
+set_target_properties(webpdemux PROPERTIES PUBLIC_HEADER
+"${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h"
+)

 # Set the version numbers.
 function(parse_version FILE NAME VAR)
@ -183,6 +238,8 @@ if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
  list(APPEND EXAMPLEUTIL_SRCS
    ${CMAKE_CURRENT_SOURCE_DIR}/examples/stopwatch.h)
  add_library(exampleutil ${EXAMPLEUTIL_SRCS})
+  target_include_directories(exampleutil
+    PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>)

  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEIOUTILS_SRCS"
    "imageio_util_[^ ]*")
@ -193,7 +250,8 @@ if(WEBP_BUILD_CWEBP OR WEBP_BUILD_DWEBP OR
  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEDEC_SRCS"
    "imagedec_[^ ]*")
  add_library(imagedec ${IMAGEDEC_SRCS})
-  target_link_libraries(imagedec imageioutil webp ${WEBP_DEP_IMG_LIBRARIES})
+  target_link_libraries(imagedec imageioutil webpdemux webp
+    ${WEBP_DEP_IMG_LIBRARIES})

  # Image-encoding utility library.
  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/imageio "IMAGEENC_SRCS"
@ -208,26 +266,22 @@ endif()

 if(WEBP_BUILD_DWEBP)
  # dwebp
-  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "DWEBP_SRCS"
    "dwebp")
  add_executable(dwebp ${DWEBP_SRCS})
-  target_link_libraries(dwebp exampleutil imagedec imageenc webpdecoder)
-  install(TARGETS dwebp RUNTIME DESTINATION bin)
-  set_property(TARGET dwebp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_link_libraries(dwebp exampleutil imagedec imageenc)
+  target_include_directories(dwebp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS dwebp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if(WEBP_BUILD_CWEBP)
  # cwebp
-  include_directories(${WEBP_DEP_IMG_INCLUDE_DIRS})
  parse_Makefile_am(${CMAKE_CURRENT_SOURCE_DIR}/examples "CWEBP_SRCS"
    "cwebp")
  add_executable(cwebp ${CWEBP_SRCS})
  target_link_libraries(cwebp exampleutil imagedec webp)
-  install(TARGETS cwebp RUNTIME DESTINATION bin)
-  set_property(TARGET cwebp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(cwebp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS cwebp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if(WEBP_BUILD_GIF2WEBP AND NOT GIF_FOUND)
@ -239,9 +293,16 @@ if(WEBP_BUILD_GIF2WEBP OR WEBP_BUILD_IMG2WEBP)
    "")
  add_library(webpmux ${WEBP_MUX_SRCS})
  target_link_libraries(webpmux webp)
+  target_include_directories(webpmux
+    PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
  parse_version(mux/Makefile.am webpmux WEBP_MUX_SOVERSION)
  set_target_properties(webpmux PROPERTIES VERSION ${PACKAGE_VERSION}
    SOVERSION ${WEBP_MUX_SOVERSION})
+  set_target_properties(webpmux PROPERTIES PUBLIC_HEADER
+"${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h;\
+${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h;"
+  )
  list(APPEND INSTALLED_LIBRARIES webpmux)
 endif()

@ -253,9 +314,8 @@ if(WEBP_BUILD_GIF2WEBP)
  add_executable(gif2webp ${GIF2WEBP_SRCS})
  target_link_libraries(gif2webp exampleutil imageioutil webp webpmux
    ${WEBP_DEP_GIF_LIBRARIES})
-  install(TARGETS gif2webp RUNTIME DESTINATION bin)
-  set_property(TARGET gif2webp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(gif2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS gif2webp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if(WEBP_BUILD_IMG2WEBP)
@ -265,9 +325,8 @@ if(WEBP_BUILD_IMG2WEBP)
    "img2webp")
  add_executable(img2webp ${IMG2WEBP_SRCS})
  target_link_libraries(img2webp exampleutil imagedec imageioutil webp webpmux)
-  install(TARGETS img2webp RUNTIME DESTINATION bin)
-  set_property(TARGET img2webp PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(img2webp PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS img2webp RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if (WEBP_BUILD_WEBPINFO)
@ -277,53 +336,51 @@ if (WEBP_BUILD_WEBPINFO)
    "webpinfo")
  add_executable(webpinfo ${WEBPINFO_SRCS})
  target_link_libraries(webpinfo exampleutil imageioutil)
-  install(TARGETS webpinfo RUNTIME DESTINATION bin)
-  set_property(TARGET webpinfo PROPERTY INCLUDE_DIRECTORIES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src)
+  target_include_directories(webpinfo PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/src)
+  install(TARGETS webpinfo RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 endif()

 if(WEBP_BUILD_WEBP_JS)
  # JavaScript version
-  add_executable(webp_js
-                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
+  add_executable(webp_js ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
  target_link_libraries(webp_js webpdecoder SDL)
+  target_include_directories(webp_js PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
  set(WEBP_HAVE_SDL 1)
  set_target_properties(webp_js PROPERTIES LINK_FLAGS
-      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
+      "-s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0 \
+       -s EXTRA_EXPORTED_RUNTIME_METHODS='[\"cwrap\"]'")
  set_target_properties(webp_js PROPERTIES OUTPUT_NAME webp)
  target_compile_definitions(webp_js PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)

  # WASM version
-  add_executable(webp_wasm
-                 ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
+  add_executable(webp_wasm ${CMAKE_CURRENT_SOURCE_DIR}/extras/webp_to_sdl.c)
  target_link_libraries(webp_wasm webpdecoder SDL)
+  target_include_directories(webp_wasm PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
  set_target_properties(webp_wasm PROPERTIES LINK_FLAGS
      "-s WASM=1 -s 'BINARYEN_METHOD=\"native-wasm\"' \
-      -s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0")
+       -s EXPORTED_FUNCTIONS='[\"_WebpToSDL\"]' -s INVOKE_RUN=0 \
+       -s EXTRA_EXPORTED_RUNTIME_METHODS='[\"cwrap\"]'")
  target_compile_definitions(webp_wasm PUBLIC EMSCRIPTEN WEBP_HAVE_SDL)

-  target_compile_definitions(webpdecoder PUBLIC EMSCRIPTEN)
+  target_compile_definitions(webpdspdecode PUBLIC EMSCRIPTEN)
 endif()

-# Generate the config.h file.
-configure_file(${CMAKE_CURRENT_LIST_DIR}/cmake/config.h.in
-  ${CMAKE_CURRENT_BINARY_DIR}/src/webp/config.h)
-add_definitions(-DHAVE_CONFIG_H)
-# The webp folder is included as we reference config.h as
-# ../webp/config.h or webp/config.h
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-
 # Install the different headers and libraries.
-install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/decode.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/demux.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/encode.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/mux_types.h
-              ${CMAKE_CURRENT_SOURCE_DIR}/src/webp/types.h
-        DESTINATION include/webp)
-install(TARGETS ${INSTALLED_LIBRARIES}
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+include(GNUInstallDirs)
+install(
+  TARGETS ${INSTALLED_LIBRARIES}
+  EXPORT ${PROJECT_NAME}Targets
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/webp
+  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
+set(ConfigPackageLocation ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/cmake/)
+install(EXPORT ${PROJECT_NAME}Targets
+        NAMESPACE ${PROJECT_NAME}::
+        DESTINATION ${ConfigPackageLocation}
+)

 # Create the CMake version file.
 include(CMakePackageConfigHelpers)
@ -335,7 +392,6 @@ write_basic_package_version_file(

 # Create the Config file.
 include(CMakePackageConfigHelpers)
-set(ConfigPackageLocation share/WebP/cmake/)
 configure_package_config_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake/WebPConfig.cmake.in
  ${CMAKE_CURRENT_BINARY_DIR}/WebPConfig.cmake
@ -362,7 +418,7 @@ foreach(I_MAN RANGE ${MAN_PAGES_RANGE})
  if(WEBP_BUILD_${EXEC_BUILD})
    list(GET MAN_PAGES ${I_MAN} MAN_PAGE)
    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/man/${MAN_PAGE}
-      DESTINATION ${CMAKE_INSTALL_PREFIX}/share/man/man1
+      DESTINATION ${CMAKE_INSTALL_MANDIR}/man1
      COMPONENT doc
    )
  endif()
--- a/Makefile.vc
+++ b/Makefile.vc
@ -227,11 +227,13 @@ DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\upsampling_msa.obj \
    $(DIROBJ)\dsp\upsampling_neon.obj \
    $(DIROBJ)\dsp\upsampling_sse2.obj \
+    $(DIROBJ)\dsp\upsampling_sse41.obj \
    $(DIROBJ)\dsp\yuv.obj \
    $(DIROBJ)\dsp\yuv_mips32.obj \
    $(DIROBJ)\dsp\yuv_mips_dsp_r2.obj \
    $(DIROBJ)\dsp\yuv_neon.obj \
    $(DIROBJ)\dsp\yuv_sse2.obj \
+    $(DIROBJ)\dsp\yuv_sse41.obj \

 DSP_ENC_OBJS = \
    $(DIROBJ)\dsp\cost.obj \
@ -285,7 +287,6 @@ ENC_OBJS = \
    $(DIROBJ)\enc\backward_references_enc.obj \
    $(DIROBJ)\enc\config_enc.obj \
    $(DIROBJ)\enc\cost_enc.obj \
-    $(DIROBJ)\enc\delta_palettization_enc.obj \
    $(DIROBJ)\enc\filter_enc.obj \
    $(DIROBJ)\enc\frame_enc.obj \
    $(DIROBJ)\enc\histogram_enc.obj \
@ -367,9 +368,11 @@ $(DIRBIN)\anim_dump.exe: $(EX_GIF_DEC_OBJS) $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\anim_dump.exe: $(IMAGEIO_ENC_OBJS)
 $(DIRBIN)\cwebp.exe: $(DIROBJ)\examples\cwebp.obj $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\cwebp.exe: $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\cwebp.exe: $(LIBWEBPDEMUX)
 $(DIRBIN)\dwebp.exe: $(DIROBJ)\examples\dwebp.obj $(IMAGEIO_DEC_OBJS)
 $(DIRBIN)\dwebp.exe: $(IMAGEIO_ENC_OBJS)
 $(DIRBIN)\dwebp.exe: $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\dwebp.exe: $(LIBWEBPDEMUX)
 $(DIRBIN)\gif2webp.exe: $(DIROBJ)\examples\gif2webp.obj $(EX_GIF_DEC_OBJS)
 $(DIRBIN)\gif2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBPMUX)
 $(DIRBIN)\gif2webp.exe: $(LIBWEBP)
@ -382,26 +385,24 @@ $(DIRBIN)\webpmux.exe: $(DIROBJ)\examples\webpmux.obj $(LIBWEBPMUX)
 $(DIRBIN)\webpmux.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
 $(DIRBIN)\img2webp.exe: $(DIROBJ)\examples\img2webp.obj $(LIBWEBPMUX)
 $(DIRBIN)\img2webp.exe: $(IMAGEIO_DEC_OBJS)
-$(DIRBIN)\img2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
+$(DIRBIN)\img2webp.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\img2webp.exe: $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\get_disto.exe: $(DIROBJ)\extras\get_disto.obj
-$(DIRBIN)\get_disto.exe: $(IMAGEIO_DEC_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
+$(DIRBIN)\get_disto.exe: $(IMAGEIO_DEC_OBJS) $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\get_disto.exe: $(LIBWEBPDEMUX) $(LIBWEBP)
 $(DIRBIN)\webp_quality.exe: $(DIROBJ)\extras\webp_quality.obj
 $(DIRBIN)\webp_quality.exe: $(IMAGEIO_UTIL_OBJS)
 $(DIRBIN)\webp_quality.exe: $(EXTRAS_OBJS) $(LIBWEBP)
 $(DIRBIN)\webpinfo.exe: $(DIROBJ)\examples\webpinfo.obj
 $(DIRBIN)\webpinfo.exe: $(IMAGEIO_DEC_OBJS)
-$(DIRBIN)\webpinfo.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS) $(LIBWEBP)
+$(DIRBIN)\webpinfo.exe: $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS)
+$(DIRBIN)\webpinfo.exe: $(LIBWEBPDEMUX) $(LIBWEBP)

 $(OUT_EXAMPLES): $(EX_UTIL_OBJS) $(LIBWEBP)
 $(EX_UTIL_OBJS) $(IMAGEIO_UTIL_OBJS): $(OUTPUT_DIRS)
 $(IMAGEIO_DEC_OBJS) $(IMAGEIO_ENC_OBJS) $(EXTRAS_OBJS): $(OUTPUT_DIRS)
 !ENDIF  # ARCH == ARM

-experimental:
-	$(MAKE) /f Makefile.vc \
-	    CFG=$(CFG) \
-	    CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)
-
 $(LIBWEBPDECODER): $(LIBWEBPDECODER_OBJS)
 $(LIBWEBP): $(LIBWEBP_OBJS)
 $(LIBWEBPMUX): $(LIBWEBPMUX_OBJS)
--- a/README.mux
+++ b/README.mux
@ -33,6 +33,7 @@ Usage: webpmux -get GET_OPTIONS INPUT -o OUTPUT
       webpmux -info INPUT
       webpmux [-h|-help]
       webpmux -version
+       webpmux argument_file_name

 GET_OPTIONS:
 Extract relevant data:
@ -92,6 +93,9 @@ INPUT & OUTPUT are in WebP format.
 Note: The nature of EXIF, XMP and ICC data is not checked and is assumed to be
 valid.

+Note: if a single file name is passed as the argument, the arguments will be
+tokenized from this file. The file name must not start with the character '-'.
+
 Visualization tool:
 ===================

--- a/README.webp_js
+++ b/README.webp_js
@ -32,7 +32,8 @@ using Emscripten and CMake.
   webp.js.mem files generated.

 The callable JavaScript function is WebPToSDL(), which decodes a raw WebP
-bitstream into a canvas. See webp_js/index.html for a simple usage sample.
+bitstream into a canvas. See webp_js/index.html for a simple usage sample
+(see below for instructions).

 Demo HTML page:
 ===============
--- a/build.gradle
+++ b/build.gradle
@ -152,11 +152,13 @@ model {
            include "upsampling_msa.c"
            include "upsampling_neon.$NEON"
            include "upsampling_sse2.c"
+            include "upsampling_sse41.c"
            include "yuv.c"
            include "yuv_mips32.c"
            include "yuv_mips_dsp_r2.c"
            include "yuv_neon.$NEON"
            include "yuv_sse2.c"
+            include "yuv_sse41.c"
            srcDir "src/utils"
            include "bit_reader_utils.c"
            include "color_cache_utils.c"
@ -196,7 +198,6 @@ model {
            include "backward_references_enc.c"
            include "config_enc.c"
            include "cost_enc.c"
-            include "delta_palettization_enc.c"
            include "filter_enc.c"
            include "frame_enc.c"
            include "histogram_enc.c"
@ -288,6 +289,7 @@ model {
    imagedec(NativeLibrarySpec) {
      binaries {
        all {
+          lib library: "webpdemux", linkage: "static"
          lib library: "webp", linkage: "static"
        }
      }
@ -330,6 +332,7 @@ model {
          lib library: "example_util", linkage: "static"
          lib library: "imagedec", linkage: "static"
          lib library: "imageio_util", linkage: "static"
+          lib library: "webpdemux", linkage: "static"
          lib library: "webp", linkage: "static"
        }
      }
@ -350,6 +353,7 @@ model {
          lib library: "imagedec", linkage: "static"
          lib library: "imageenc", linkage: "static"
          lib library: "imageio_util", linkage: "static"
+          lib library: "webpdemux", linkage: "static"
          lib library: "webp"
        }
      }
@ -389,6 +393,7 @@ model {
          lib library: "imagedec", linkage: "static"
          lib library: "imageio_util", linkage: "static"
          lib library: "webpmux", linkage: "static"
+          lib library: "webpdemux", linkage: "static"
          lib library: "webp"
        }
      }
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@ -103,9 +103,6 @@
 /* Version number of package */
 #cmakedefine VERSION "@VERSION@"

-/* Enable experimental code */
-#cmakedefine WEBP_EXPERIMENTAL_FEATURES 1
-
 /* Set to 1 if AVX2 is supported */
 #cmakedefine WEBP_HAVE_AVX2 1

--- a/configure.ac
+++ b/configure.ac
@ -347,6 +347,8 @@ AS_IF([test "x$enable_gl" != "xno"], [
      # override with --with-gl*
      glut_cflags="$glut_cflags|-framework GLUT -framework OpenGL"
      glut_ldflags="$glut_ldflags|-framework GLUT -framework OpenGL"
+      # quiet deprecation warnings for glut
+      TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wno-deprecated-declarations])
      ;;
  esac

@ -443,31 +445,54 @@ AC_ARG_ENABLE([sdl],
                              @<:@default=auto@:>@]))
 AS_IF([test "x$enable_sdl" != "xno"], [
  CLEAR_LIBVARS([SDL])
+  AC_PATH_PROGS([LIBSDL_CONFIG], [sdl-config])
+  if test -n "$LIBSDL_CONFIG"; then
+    SDL_INCLUDES=`$LIBSDL_CONFIG --cflags`
+    SDL_LIBS="`$LIBSDL_CONFIG --libs`"
+  fi
+
  WITHLIB_OPTION([sdl], [SDL])

  sdl_header="no"
  LIBCHECK_PROLOGUE([SDL])
-  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL_SDL.h"],
+  AC_CHECK_HEADER([SDL/SDL.h], [sdl_header="SDL/SDL.h"],
                  [AC_CHECK_HEADER([SDL.h], [sdl_header="SDL.h"],
                  [AC_MSG_WARN(SDL library not available - no sdl.h)])])
  if test x"$sdl_header" != "xno"; then
-    AC_CHECK_LIB(SDL, SDL_Init,
-                 [SDL_LIBS="-lSDL"
-                  SDL_INCLUDES="-DWEBP_HAVE_SDL"
-                  AC_DEFINE(WEBP_HAVE_SDL, [1],
-                            [Set to 1 if SDL library is installed])
-                  sdl_support=yes
-                 ],
-                 AC_MSG_WARN(Optional SDL library not found),
-                 [$MATH_LIBS])
+    AC_LANG_PUSH(C)
+    SDL_SAVED_LIBS="$LIBS"
+    for lib in "" "-lSDL" "-lSDLmain -lSDL"; do
+      LIBS="$SDL_SAVED_LIBS $lib"
+      # Perform a full link to ensure SDL_main is resolved if needed.
+      AC_LINK_IFELSE(
+        [AC_LANG_SOURCE([
+           #include <$sdl_header>
+           int main(int argc, char** argv) {
+             SDL_Init(0);
+             return 0;
+           }])],
+        [SDL_LIBS="$LDFLAGS $LIBS"
+         SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_SDL"
+         AC_DEFINE(WEBP_HAVE_SDL, [1],
+                   [Set to 1 if SDL library is installed])
+         sdl_support=yes]
+      )
+      if test x"$sdl_support" = "xyes"; then
+        break
+      fi
+    done
+    # LIBS is restored by LIBCHECK_EPILOGUE
+    AC_LANG_POP
    if test x"$sdl_header" = "xSDL.h"; then
      SDL_INCLUDES="$SDL_INCLUDES -DWEBP_HAVE_JUST_SDL_H"
    fi
  fi
  LIBCHECK_EPILOGUE([SDL])

-  if test "$sdl_support" = "yes"; then
+  if test x"$sdl_support" = "xyes"; then
    build_vwebp_sdl=yes
+  else
+    AC_MSG_WARN(Optional SDL library not found)
  fi
 ])

@ -601,7 +626,7 @@ AS_IF([test "x$enable_gif" != "xno"], [
 AM_CONDITIONAL([BUILD_ANIMDIFF], [test "${build_anim_diff}" = "yes"])
 AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

-if test "$enable_libwebpmux" = "yes"; then
+if test "$enable_libwebpdemux" = "yes" -a "$enable_libwebpmux" = "yes"; then
  build_img2webp=yes
 fi
 AM_CONDITIONAL([BUILD_IMG2WEBP], [test "${build_img2webp}" = "yes"])
@ -676,19 +701,6 @@ fi
 AC_MSG_RESULT(${enable_swap_16bit_csp-no})
 AC_SUBST(USE_SWAP_16BIT_CSP)

-dnl === If --enable-experimental is defined, add -DWEBP_EXPERIMENTAL_FEATURES
-
-USE_EXPERIMENTAL_CODE=""
-AC_MSG_CHECKING(if --enable-experimental option is specified)
-AC_ARG_ENABLE([experimental], AS_HELP_STRING([--enable-experimental],
-                                             [Activate experimental features]))
-if test "$enable_experimental" = "yes"; then
-  AC_DEFINE(WEBP_EXPERIMENTAL_FEATURES, [1], [Enable experimental code])
-  USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
-fi
-AC_MSG_RESULT(${enable_experimental-no})
-AC_SUBST(USE_EXPERIMENTAL_CODE)
-
 dnl === If --disable-near-lossless is defined, add -DWEBP_NEAR_LOSSLESS=0

 AC_DEFINE(WEBP_NEAR_LOSSLESS, [1], [Enable near lossless encoding])
@ -715,8 +727,9 @@ AM_CONDITIONAL([WANT_MUX], [test "$enable_libwebpmux" = "yes"])
 dnl === Check whether libwebpdemux should be built
 AC_MSG_CHECKING(whether libwebpdemux is to be built)
 AC_ARG_ENABLE([libwebpdemux],
-              AS_HELP_STRING([--enable-libwebpdemux],
-                             [Build libwebpdemux @<:@default=no@:>@]))
+              AS_HELP_STRING([--disable-libwebpdemux],
+                             [Disable libwebpdemux @<:@default=no@:>@]),
+              [], [enable_libwebpdemux=yes])
 AC_MSG_RESULT(${enable_libwebpdemux-no})
 AM_CONDITIONAL([WANT_DEMUX], [test "$enable_libwebpdemux" = "yes"])

@ -765,14 +778,14 @@ libwebpmux: ${enable_libwebpmux-no}
 libwebpextras: ${enable_libwebpextras-no}

 Tools:
-cwebp : yes
+cwebp : ${enable_libwebpdemux-no}
  Input format support
  ====================
  JPEG : ${jpeg_support-no}
  PNG  : ${png_support-no}
  TIFF : ${tiff_support-no}
  WIC  : ${wic_support-no}
-dwebp : yes
+dwebp : ${enable_libwebpdemux-no}
  Output format support
  =====================
  PNG  : ${png_support-no}
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -27,7 +27,7 @@ LOCAL_SRC_FILES := \

 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
-LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webp
+LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webpdemux webp

 LOCAL_MODULE := cwebp

@ -43,8 +43,7 @@ LOCAL_SRC_FILES := \

 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
-LOCAL_STATIC_LIBRARIES := example_util imagedec imageenc webp
-
+LOCAL_STATIC_LIBRARIES := example_util imagedec imageenc webpdemux webp
 LOCAL_MODULE := dwebp

 include $(BUILD_EXECUTABLE)
@ -75,7 +74,8 @@ LOCAL_SRC_FILES := \

 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
-LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webpmux webp
+LOCAL_STATIC_LIBRARIES := example_util imageio_util imagedec webpmux webpdemux \
+                          webp

 LOCAL_MODULE := img2webp_example

--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@ -1,6 +1,9 @@
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src

-bin_PROGRAMS = dwebp cwebp
+bin_PROGRAMS =
+if WANT_DEMUX
+  bin_PROGRAMS += dwebp cwebp
+endif
 if BUILD_ANIMDIFF
  noinst_PROGRAMS = anim_diff anim_dump
 endif
@ -26,7 +29,7 @@ libexample_util_la_SOURCES = example_util.c example_util.h
 libexample_util_la_LIBADD = ../src/libwebp.la

 anim_diff_SOURCES = anim_diff.c anim_util.c anim_util.h
-anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+anim_diff_CPPFLAGS = $(AM_CPPFLAGS) $(GIF_INCLUDES)
 anim_diff_LDADD  =
 anim_diff_LDADD += ../src/demux/libwebpdemux.la
 anim_diff_LDADD += libexample_util.la
@ -34,7 +37,7 @@ anim_diff_LDADD += ../imageio/libimageio_util.la
 anim_diff_LDADD += $(GIF_LIBS) -lm

 anim_dump_SOURCES = anim_dump.c anim_util.c anim_util.h
-anim_dump_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(PNG_INCLUDES)
+anim_dump_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES)
 anim_dump_CPPFLAGS += $(GIF_INCLUDES)
 anim_dump_LDADD  =
 anim_dump_LDADD += ../src/demux/libwebpdemux.la
@ -44,7 +47,7 @@ anim_dump_LDADD += ../imageio/libimageenc.la
 anim_dump_LDADD += $(PNG_LIBS) $(GIF_LIBS) $(TIFF_LIBS) -lm

 cwebp_SOURCES  = cwebp.c stopwatch.h
-cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+cwebp_CPPFLAGS  = $(AM_CPPFLAGS)
 cwebp_LDADD  =
 cwebp_LDADD += libexample_util.la
 cwebp_LDADD += ../imageio/libimageio_util.la
@ -53,7 +56,7 @@ cwebp_LDADD += ../src/libwebp.la
 cwebp_LDADD += $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

 dwebp_SOURCES = dwebp.c stopwatch.h
-dwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+dwebp_CPPFLAGS  = $(AM_CPPFLAGS)
 dwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES)
 dwebp_LDADD  =
 dwebp_LDADD += libexample_util.la
@ -64,7 +67,7 @@ dwebp_LDADD += ../src/libwebp.la
 dwebp_LDADD +=$(PNG_LIBS) $(JPEG_LIBS)

 gif2webp_SOURCES = gif2webp.c gifdec.c gifdec.h
-gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
+gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(GIF_INCLUDES)
 gif2webp_LDADD  =
 gif2webp_LDADD += libexample_util.la
 gif2webp_LDADD += ../imageio/libimageio_util.la
@ -73,7 +76,7 @@ gif2webp_LDADD += ../src/libwebp.la
 gif2webp_LDADD += $(GIF_LIBS)

 vwebp_SOURCES = vwebp.c
-vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GL_INCLUDES)
+vwebp_CPPFLAGS = $(AM_CPPFLAGS) $(GL_INCLUDES)
 vwebp_LDADD  =
 vwebp_LDADD += libexample_util.la
 vwebp_LDADD += ../imageio/libimageio_util.la
@ -81,7 +84,7 @@ vwebp_LDADD += ../src/demux/libwebpdemux.la
 vwebp_LDADD += $(GL_LIBS)

 webpmux_SOURCES = webpmux.c
-webpmux_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webpmux_CPPFLAGS = $(AM_CPPFLAGS)
 webpmux_LDADD  =
 webpmux_LDADD += libexample_util.la
 webpmux_LDADD += ../imageio/libimageio_util.la
@ -89,7 +92,7 @@ webpmux_LDADD += ../src/mux/libwebpmux.la
 webpmux_LDADD += ../src/libwebp.la

 img2webp_SOURCES = img2webp.c
-img2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+img2webp_CPPFLAGS = $(AM_CPPFLAGS)
 img2webp_LDADD  =
 img2webp_LDADD += libexample_util.la
 img2webp_LDADD += ../imageio/libimageio_util.la
@ -99,7 +102,7 @@ img2webp_LDADD += ../src/libwebp.la
 img2webp_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webpinfo_SOURCES = webpinfo.c
-webpinfo_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webpinfo_CPPFLAGS = $(AM_CPPFLAGS)
 webpinfo_LDADD  =
 webpinfo_LDADD += libexample_util.la
 webpinfo_LDADD += ../imageio/libimageio_util.la
--- a/examples/anim_diff.c
+++ b/examples/anim_diff.c
@ -187,11 +187,9 @@ static void Help(void) {
  printf("  -min_psnr <float> ... minimum per-frame PSNR\n");
  printf("  -raw_comparison ..... if this flag is not used, RGB is\n");
  printf("                        premultiplied before comparison\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  printf("  -max_diff <int> ..... maximum allowed difference per channel "
-         "                        between corresponding pixels in subsequent"
+  printf("  -max_diff <int> ..... maximum allowed difference per channel\n"
+         "                        between corresponding pixels in subsequent\n"
         "                        frames\n");
-#endif
 }

 int main(int argc, const char* argv[]) {
@ -236,7 +234,6 @@ int main(int argc, const char* argv[]) {
      }
    } else if (!strcmp(argv[c], "-raw_comparison")) {
      premultiply = 0;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
    } else if (!strcmp(argv[c], "-max_diff")) {
      if (c < argc - 1) {
        const char* const v = argv[++c];
@ -250,7 +247,6 @@ int main(int argc, const char* argv[]) {
      } else {
        parse_error = 1;
      }
-#endif
    } else {
      if (!got_input1) {
        files[0] = argv[c];
--- a/examples/anim_util.c
+++ b/examples/anim_util.c
@ -593,6 +593,9 @@ static int ReadAnimatedGIF(const char filename[], AnimatedImage* const image,
    curr_frame = &image->frames[i];
    curr_rgba = curr_frame->rgba;
    curr_frame->duration = GetFrameDurationGIF(gif, i);
+    // Force frames with a small or no duration to 100ms to be consistent
+    // with web browsers and other transcoding tools (like gif2webp itself).
+    if (curr_frame->duration <= 10) curr_frame->duration = 100;

    if (i == 0) {  // Initialize as transparent.
      curr_frame->is_key_frame = 1;
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -140,10 +140,11 @@ static void PrintByteCount(const int bytes[4], int total_size,
  fprintf(stderr, "| %7d  (%.1f%%)\n", total, 100.f * total / total_size);
 }

-static void PrintPercents(const int counts[4], int total) {
+static void PrintPercents(const int counts[4]) {
  int s;
+  const int total = counts[0] + counts[1] + counts[2] + counts[3];
  for (s = 0; s < 4; ++s) {
-    fprintf(stderr, "|      %2d%%", 100 * counts[s] / total);
+    fprintf(stderr, "|      %2d%%", (int)(100. * counts[s] / total + .5));
  }
  fprintf(stderr, "| %7d\n", total);
 }
@ -186,7 +187,8 @@ static void PrintExtraInfoLossless(const WebPPicture* const pic,
  } else {
    fprintf(stderr, "File:      %s\n", file_name);
    fprintf(stderr, "Dimension: %d x %d\n", pic->width, pic->height);
-    fprintf(stderr, "Output:    %d bytes\n", stats->coded_size);
+    fprintf(stderr, "Output:    %d bytes (%.2f bpp)\n", stats->coded_size,
+            8.f * stats->coded_size / pic->width / pic->height);
    PrintFullLosslessInfo(stats, "ARGB");
  }
 }
@ -207,15 +209,18 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
            pic->width, pic->height,
            stats->alpha_data_size ? " (with alpha)" : "");
    fprintf(stderr, "Output:    "
-            "%d bytes Y-U-V-All-PSNR %2.2f %2.2f %2.2f   %2.2f dB\n",
+            "%d bytes Y-U-V-All-PSNR %2.2f %2.2f %2.2f   %2.2f dB\n"
+            "           (%.2f bpp)\n",
            stats->coded_size,
-            stats->PSNR[0], stats->PSNR[1], stats->PSNR[2], stats->PSNR[3]);
+            stats->PSNR[0], stats->PSNR[1], stats->PSNR[2], stats->PSNR[3],
+            8.f * stats->coded_size / pic->width / pic->height);
    if (total > 0) {
      int totals[4] = { 0, 0, 0, 0 };
-      fprintf(stderr, "block count:  intra4: %d\n"
-                      "              intra16: %d  (-> %.2f%%)\n",
-              num_i4, num_i16, 100.f * num_i16 / total);
-      fprintf(stderr, "              skipped block: %d (%.2f%%)\n",
+      fprintf(stderr, "block count:  intra4:     %6d  (%.2f%%)\n"
+                      "              intra16:    %6d  (%.2f%%)\n"
+                      "              skipped:    %6d  (%.2f%%)\n",
+              num_i4, 100.f * num_i4 / total,
+              num_i16, 100.f * num_i16 / total,
              num_skip, 100.f * num_skip / total);
      fprintf(stderr, "bytes used:  header:         %6d  (%.1f%%)\n"
                      "             mode-partition: %6d  (%.1f%%)\n",
@ -239,7 +244,7 @@ static void PrintExtraInfoLossy(const WebPPicture* const pic, int short_output,
        PrintByteCount(stats->residual_bytes[2], stats->coded_size, totals);
      }
      fprintf(stderr, "    macroblocks:  ");
-      PrintPercents(stats->segment_size, total);
+      PrintPercents(stats->segment_size);
      fprintf(stderr, "      quantizer:  ");
      PrintValues(stats->segment_quant);
      fprintf(stderr, "   filter level:  ");
@ -580,9 +585,6 @@ static void HelpLong(void) {
  printf("  -near_lossless <int> ... use near-lossless image\n"
         "                           preprocessing (0..100=off), "
         "default=100\n");
-#ifdef WEBP_EXPERIMENTAL_FEATURES  /* not documented yet */
-  printf("  -delta_palette ......... use delta palettization\n");
-#endif  // WEBP_EXPERIMENTAL_FEATURES
  printf("  -hint <string> ......... specify image characteristics hint,\n");
  printf("                           one of: photo, picture or graph\n");

@ -751,11 +753,6 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-near_lossless") && c < argc - 1) {
      config.near_lossless = ExUtilGetInt(argv[++c], 0, &parse_error);
      config.lossless = 1;  // use near-lossless only with lossless
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    } else if (!strcmp(argv[c], "-delta_palette")) {
-      config.use_delta_palette = 1;
-      config.lossless = 1;  // delta-palette is for lossless only
-#endif  // WEBP_EXPERIMENTAL_FEATURES
    } else if (!strcmp(argv[c], "-hint") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "photo")) {
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -12,10 +12,14 @@

 #include "./example_util.h"

+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

+#include "webp/mux_types.h"
+#include "../imageio/imageio_util.h"
+
 //------------------------------------------------------------------------------
 // String parsing

@ -56,3 +60,68 @@ float ExUtilGetFloat(const char* const v, int* const error) {
  }
  return f;
 }
+
+//------------------------------------------------------------------------------
+
+static void ResetCommandLineArguments(int argc, const char* argv[],
+                                      CommandLineArguments* const args) {
+  assert(args != NULL);
+  args->argc_ = argc;
+  args->argv_ = argv;
+  args->own_argv_ = 0;
+  WebPDataInit(&args->argv_data_);
+}
+
+void ExUtilDeleteCommandLineArguments(CommandLineArguments* const args) {
+  if (args != NULL) {
+    if (args->own_argv_) {
+      free((void*)args->argv_);
+      WebPDataClear(&args->argv_data_);
+    }
+    ResetCommandLineArguments(0, NULL, args);
+  }
+}
+
+#define MAX_ARGC 16384
+int ExUtilInitCommandLineArguments(int argc, const char* argv[],
+                                   CommandLineArguments* const args) {
+  if (args == NULL || argv == NULL) return 0;
+  ResetCommandLineArguments(argc, argv, args);
+  if (argc == 1 && argv[0][0] != '-') {
+    char* cur;
+    const char sep[] = " \t\r\n\f\v";
+    if (!ExUtilReadFileToWebPData(argv[0], &args->argv_data_)) {
+      return 0;
+    }
+    args->own_argv_ = 1;
+    args->argv_ = (const char**)malloc(MAX_ARGC * sizeof(*args->argv_));
+    if (args->argv_ == NULL) return 0;
+
+    argc = 0;
+    for (cur = strtok((char*)args->argv_data_.bytes, sep);
+         cur != NULL;
+         cur = strtok(NULL, sep)) {
+      if (argc == MAX_ARGC) {
+        fprintf(stderr, "ERROR: Arguments limit %d reached\n", MAX_ARGC);
+        return 0;
+      }
+      assert(strlen(cur) != 0);
+      args->argv_[argc++] = cur;
+    }
+    args->argc_ = argc;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+int ExUtilReadFileToWebPData(const char* const filename,
+                             WebPData* const webp_data) {
+  const uint8_t* data;
+  size_t size;
+  if (webp_data == NULL) return 0;
+  if (!ImgIoUtilReadFile(filename, &data, &size)) return 0;
+  webp_data->bytes = data;
+  webp_data->size = size;
+  return 1;
+}
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -14,6 +14,7 @@
 #define WEBP_EXAMPLES_EXAMPLE_UTIL_H_

 #include "webp/types.h"
+#include "webp/mux_types.h"

 #ifdef __cplusplus
 extern "C" {
@ -35,6 +36,33 @@ float ExUtilGetFloat(const char* const v, int* const error);
 // actually parsed is returned, or -1 if an error occurred.
 int ExUtilGetInts(const char* v, int base, int max_output, int output[]);

+// Reads a file named 'filename' into a WebPData structure. The content of
+// webp_data is overwritten. Returns false in case of error.
+int ExUtilReadFileToWebPData(const char* const filename,
+                             WebPData* const webp_data);
+
+//------------------------------------------------------------------------------
+// Command-line arguments
+
+typedef struct {
+  int argc_;
+  const char** argv_;
+  WebPData argv_data_;
+  int own_argv_;
+} CommandLineArguments;
+
+// Initializes the structure from the command-line parameters. If there is
+// only one parameter and it does not start with a '-', then it is assumed to
+// be a file name. This file will be read and tokenized into command-line
+// arguments. The content of 'args' is overwritten.
+// Returns false in case of error (memory allocation failure, non
+// existing file, too many arguments, ...).
+int ExUtilInitCommandLineArguments(int argc, const char* argv[],
+                                   CommandLineArguments* const args);
+
+// Deallocate all memory and reset 'args'.
+void ExUtilDeleteCommandLineArguments(CommandLineArguments* const args);
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -23,6 +23,10 @@

 #ifdef WEBP_HAVE_GIF

+#if defined(HAVE_UNISTD_H) && HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
 #include <gif_lib.h>
 #include "webp/encode.h"
 #include "webp/mux.h"
@ -30,6 +34,10 @@
 #include "../imageio/imageio_util.h"
 #include "./gifdec.h"

+#if !defined(STDIN_FILENO)
+#define STDIN_FILENO 0
+#endif
+
 //------------------------------------------------------------------------------

 static int transparent_index = GIF_INDEX_INVALID;  // Opaque by default.
@ -263,9 +271,11 @@ int main(int argc, const char *argv[]) {

  // Start the decoder object
 #if LOCAL_GIF_PREREQ(5,0)
-  gif = DGifOpenFileName(in_file, &gif_error);
+  gif = !strcmp(in_file, "-") ? DGifOpenFileHandle(STDIN_FILENO, &gif_error)
+                              : DGifOpenFileName(in_file, &gif_error);
 #else
-  gif = DGifOpenFileName(in_file);
+  gif = !strcmp(in_file, "-") ? DGifOpenFileHandle(STDIN_FILENO)
+                              : DGifOpenFileName(in_file);
 #endif
  if (gif == NULL) goto End;

@ -351,6 +361,14 @@ int main(int argc, const char *argv[]) {
        GIFDisposeFrame(orig_dispose, &gif_rect, &prev_canvas, &curr_canvas);
        GIFCopyPixels(&curr_canvas, &prev_canvas);

+        // Force frames with a small or no duration to 100ms to be consistent
+        // with web browsers and other transcoding tools. This also avoids
+        // incorrect durations between frames when padding frames are
+        // discarded.
+        if (frame_duration <= 10) {
+          frame_duration = 100;
+        }
+
        // Update timestamp (for next frame).
        frame_timestamp += frame_duration;

@ -532,8 +550,13 @@ int main(int argc, const char *argv[]) {
      goto End;
    }
    if (!quiet) {
-      fprintf(stderr, "Saved output file (%d bytes): %s\n",
-              (int)webp_data.size, out_file);
+      if (!strcmp(out_file, "-")) {
+        fprintf(stderr, "Saved %d bytes to STDIO\n",
+                (int)webp_data.size);
+      } else {
+        fprintf(stderr, "Saved output file (%d bytes): %s\n",
+                (int)webp_data.size, out_file);
+      }
    }
  } else {
    if (!quiet) {
--- a/examples/img2webp.c
+++ b/examples/img2webp.c
@ -117,14 +117,13 @@ static int SetLoopCount(int loop_count, WebPData* const webp_data) {

 //------------------------------------------------------------------------------

-int main(int argc, char* argv[]) {
+int main(int argc, const char* argv[]) {
  const char* output = NULL;
  WebPAnimEncoder* enc = NULL;
  int verbose = 0;
  int pic_num = 0;
  int duration = 100;
  int timestamp_ms = 0;
-  int ok = 1;
  int loop_count = 0;
  int width = 0, height = 0;
  WebPAnimEncoderOptions anim_config;
@ -133,17 +132,23 @@ int main(int argc, char* argv[]) {
  WebPData webp_data;
  int c;
  int have_input = 0;
+  CommandLineArguments cmd_args;
+  int ok = ExUtilInitCommandLineArguments(argc - 1, argv + 1, &cmd_args);
+  if (!ok) return 1;
+  argc = cmd_args.argc_;
+  argv = cmd_args.argv_;

  WebPDataInit(&webp_data);
  if (!WebPAnimEncoderOptionsInit(&anim_config) ||
      !WebPConfigInit(&config) ||
      !WebPPictureInit(&pic)) {
    fprintf(stderr, "Library version mismatch!\n");
-    return 1;
+    ok = 0;
+    goto End;
  }

  // 1st pass of option parsing
-  for (c = 1; ok && c < argc; ++c) {
+  for (c = 0; ok && c < argc; ++c) {
    if (argv[c][0] == '-') {
      int parse_error = 0;
      if (!strcmp(argv[c], "-o") && c + 1 < argc) {
@ -171,7 +176,7 @@ int main(int argc, char* argv[]) {
        verbose = 1;
      } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
        Help();
-        return 0;
+        goto End;
      } else {
        continue;
      }
@ -184,13 +189,13 @@ int main(int argc, char* argv[]) {
  }
  if (!have_input) {
    fprintf(stderr, "No input file(s) for generating animation!\n");
-    return 0;
+    goto End;
  }

  // image-reading pass
  pic_num = 0;
  config.lossless = 1;
-  for (c = 1; ok && c < argc; ++c) {
+  for (c = 0; ok && c < argc; ++c) {
    if (argv[c] == NULL) continue;
    if (argv[c][0] == '-') {    // parse local options
      int parse_error = 0;
@ -294,7 +299,7 @@ int main(int argc, char* argv[]) {
    fprintf(stderr, "[%d frames, %u bytes].\n",
            pic_num, (unsigned int)webp_data.size);
  }
-
  WebPDataClear(&webp_data);
+  ExUtilDeleteCommandLineArguments(&cmd_args);
  return ok ? 0 : 1;
 }
--- a/examples/webpinfo.c
+++ b/examples/webpinfo.c
@ -340,7 +340,7 @@ static WebPInfoStatus ParseLossyHeader(const ChunkData* const chunk_data,
  WebPInfoStatus status = WEBP_INFO_OK;
  uint64_t bit_position = 0;
  uint64_t* const bit_pos = &bit_position;
-  int color_space, clamp_type;
+  int colorspace, clamp_type;
  printf("  Parsing lossy bitstream...\n");
  // Calling WebPGetFeatures() in ProcessImageChunk() should ensure this.
  assert(chunk_data->size_ >= CHUNK_HEADER_SIZE + 10);
@ -381,9 +381,9 @@ static WebPInfoStatus ParseLossyHeader(const ChunkData* const chunk_data,
    LOG_ERROR("Bad partition length.");
    return WEBP_INFO_BITSTREAM_ERROR;
  }
-  GET_BITS(color_space, 1);
+  GET_BITS(colorspace, 1);
  GET_BITS(clamp_type, 1);
-  printf("  Color space:      %d\n", color_space);
+  printf("  Color space:      %d\n", colorspace);
  printf("  Clamp type:       %d\n", clamp_type);
  status = ParseLossySegmentHeader(webp_info, data, data_size, bit_pos);
  if (status != WEBP_INFO_OK) return status;
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -47,6 +47,7 @@
    webpmux -info in.webp
    webpmux [ -h | -help ]
    webpmux -version
+    webpmux argument_file_name
 */

 #ifdef HAVE_CONFIG_H
@ -108,28 +109,26 @@ static const char* const kDescriptions[LAST_FEATURE] = {
 };

 typedef struct {
-  FeatureType type_;
-  FeatureArg* args_;
-  int arg_count_;
-} Feature;
+  CommandLineArguments cmd_args_;

-typedef struct {
  ActionType action_type_;
  const char* input_;
  const char* output_;
-  Feature feature_;
-} WebPMuxConfig;
+  FeatureType type_;
+  FeatureArg* args_;
+  int arg_count_;
+} Config;

 //------------------------------------------------------------------------------
 // Helper functions.

-static int CountOccurrences(const char* arglist[], int list_length,
-                            const char* arg) {
+static int CountOccurrences(const CommandLineArguments* const args,
+                            const char* const arg) {
  int i;
  int num_occurences = 0;

-  for (i = 0; i < list_length; ++i) {
-    if (!strcmp(arglist[i], arg)) {
+  for (i = 0; i < args->argc_; ++i) {
+    if (!strcmp(args->argv_[i], arg)) {
      ++num_occurences;
    }
  }
@ -301,6 +300,7 @@ static void PrintHelp(void) {
  printf("       webpmux -info INPUT\n");
  printf("       webpmux [-h|-help]\n");
  printf("       webpmux -version\n");
+  printf("       webpmux argument_file_name\n");

  printf("\n");
  printf("GET_OPTIONS:\n");
@ -369,6 +369,10 @@ static void PrintHelp(void) {

  printf("\nNote: The nature of EXIF, XMP and ICC data is not checked");
  printf(" and is assumed to be\nvalid.\n");
+  printf("\nNote: if a single file name is passed as the argument, the "
+         "arguments will be\n");
+  printf("tokenized from this file. The file name must not start with "
+         "the character '-'.\n");
 }

 static void WarnAboutOddOffset(const WebPMuxFrameInfo* const info) {
@ -379,22 +383,12 @@ static void WarnAboutOddOffset(const WebPMuxFrameInfo* const info) {
  }
 }

-static int ReadFileToWebPData(const char* const filename,
-                              WebPData* const webp_data) {
-  const uint8_t* data;
-  size_t size;
-  if (!ImgIoUtilReadFile(filename, &data, &size)) return 0;
-  webp_data->bytes = data;
-  webp_data->size = size;
-  return 1;
-}
-
 static int CreateMux(const char* const filename, WebPMux** mux) {
  WebPData bitstream;
  assert(mux != NULL);
-  if (!ReadFileToWebPData(filename, &bitstream)) return 0;
+  if (!ExUtilReadFileToWebPData(filename, &bitstream)) return 0;
  *mux = WebPMuxCreate(&bitstream, 1);
-  free((void*)bitstream.bytes);
+  WebPDataClear(&bitstream);
  if (*mux != NULL) return 1;
  fprintf(stderr, "Failed to create mux object from file %s.\n", filename);
  return 0;
@ -517,9 +511,10 @@ static int ParseBgcolorArgs(const char* args, uint32_t* const bgcolor) {
 //------------------------------------------------------------------------------
 // Clean-up.

-static void DeleteConfig(WebPMuxConfig* config) {
+static void DeleteConfig(Config* const config) {
  if (config != NULL) {
-    free(config->feature_.args_);
+    free(config->args_);
+    ExUtilDeleteCommandLineArguments(&config->cmd_args_);
    memset(config, 0, sizeof(*config));
  }
 }
@ -531,7 +526,7 @@ static void DeleteConfig(WebPMuxConfig* config) {
 // Returns 1 on valid, 0 otherwise.
 // Also fills up num_feature_args to be number of feature arguments given.
 // (e.g. if there are 4 '-frame's and 1 '-loop', then num_feature_args = 5).
-static int ValidateCommandLine(int argc, const char* argv[],
+static int ValidateCommandLine(const CommandLineArguments* const cmd_args,
                               int* num_feature_args) {
  int num_frame_args;
  int num_loop_args;
@ -543,27 +538,27 @@ static int ValidateCommandLine(int argc, const char* argv[],
  *num_feature_args = 0;

  // Simple checks.
-  if (CountOccurrences(argv, argc, "-get") > 1) {
+  if (CountOccurrences(cmd_args, "-get") > 1) {
    ERROR_GOTO1("ERROR: Multiple '-get' arguments specified.\n", ErrValidate);
  }
-  if (CountOccurrences(argv, argc, "-set") > 1) {
+  if (CountOccurrences(cmd_args, "-set") > 1) {
    ERROR_GOTO1("ERROR: Multiple '-set' arguments specified.\n", ErrValidate);
  }
-  if (CountOccurrences(argv, argc, "-strip") > 1) {
+  if (CountOccurrences(cmd_args, "-strip") > 1) {
    ERROR_GOTO1("ERROR: Multiple '-strip' arguments specified.\n", ErrValidate);
  }
-  if (CountOccurrences(argv, argc, "-info") > 1) {
+  if (CountOccurrences(cmd_args, "-info") > 1) {
    ERROR_GOTO1("ERROR: Multiple '-info' arguments specified.\n", ErrValidate);
  }
-  if (CountOccurrences(argv, argc, "-o") > 1) {
+  if (CountOccurrences(cmd_args, "-o") > 1) {
    ERROR_GOTO1("ERROR: Multiple output files specified.\n", ErrValidate);
  }

  // Compound checks.
-  num_frame_args = CountOccurrences(argv, argc, "-frame");
-  num_loop_args = CountOccurrences(argv, argc, "-loop");
-  num_bgcolor_args = CountOccurrences(argv, argc, "-bgcolor");
-  num_durations_args = CountOccurrences(argv, argc, "-duration");
+  num_frame_args = CountOccurrences(cmd_args, "-frame");
+  num_loop_args = CountOccurrences(cmd_args, "-loop");
+  num_bgcolor_args = CountOccurrences(cmd_args, "-bgcolor");
+  num_durations_args = CountOccurrences(cmd_args, "-duration");

  if (num_loop_args > 1) {
    ERROR_GOTO1("ERROR: Multiple loop counts specified.\n", ErrValidate);
@ -598,7 +593,7 @@ static int ValidateCommandLine(int argc, const char* argv[],

 #define ACTION_IS_NIL (config->action_type_ == NIL_ACTION)

-#define FEATURETYPE_IS_NIL (feature->type_ == NIL_FEATURE)
+#define FEATURETYPE_IS_NIL (config->type_ == NIL_FEATURE)

 #define CHECK_NUM_ARGS_LESS(NUM, LABEL)                                  \
  if (argc < i + (NUM)) {                                                \
@ -614,15 +609,15 @@ static int ValidateCommandLine(int argc, const char* argv[],

 // Parses command-line arguments to fill up config object. Also performs some
 // semantic checks.
-static int ParseCommandLine(int argc, const char* argv[],
-                            WebPMuxConfig* config) {
+static int ParseCommandLine(Config* config) {
  int i = 0;
  int feature_arg_index = 0;
  int ok = 1;
+  int argc = config->cmd_args_.argc_;
+  const char* const* argv = config->cmd_args_.argv_;

  while (i < argc) {
-    Feature* const feature = &config->feature_;
-    FeatureArg* const arg = &feature->args_[feature_arg_index];
+    FeatureArg* const arg = &config->args_[feature_arg_index];
    if (argv[i][0] == '-') {  // One of the action types or output.
      if (!strcmp(argv[i], "-set")) {
        if (ACTION_IS_NIL) {
@ -638,8 +633,8 @@ static int ParseCommandLine(int argc, const char* argv[],
        } else {
          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
        }
-        if (FEATURETYPE_IS_NIL || feature->type_ == FEATURE_DURATION) {
-          feature->type_ = FEATURE_DURATION;
+        if (FEATURETYPE_IS_NIL || config->type_ == FEATURE_DURATION) {
+          config->type_ = FEATURE_DURATION;
        } else {
          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
        }
@ -656,7 +651,7 @@ static int ParseCommandLine(int argc, const char* argv[],
      } else if (!strcmp(argv[i], "-strip")) {
        if (ACTION_IS_NIL) {
          config->action_type_ = ACTION_STRIP;
-          feature->arg_count_ = 0;
+          config->arg_count_ = 0;
        } else {
          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
        }
@ -668,8 +663,8 @@ static int ParseCommandLine(int argc, const char* argv[],
        } else {
          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
        }
-        if (FEATURETYPE_IS_NIL || feature->type_ == FEATURE_ANMF) {
-          feature->type_ = FEATURE_ANMF;
+        if (FEATURETYPE_IS_NIL || config->type_ == FEATURE_ANMF) {
+          config->type_ = FEATURE_ANMF;
        } else {
          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
        }
@ -685,8 +680,8 @@ static int ParseCommandLine(int argc, const char* argv[],
        } else {
          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
        }
-        if (FEATURETYPE_IS_NIL || feature->type_ == FEATURE_ANMF) {
-          feature->type_ = FEATURE_ANMF;
+        if (FEATURETYPE_IS_NIL || config->type_ == FEATURE_ANMF) {
+          config->type_ = FEATURE_ANMF;
        } else {
          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
        }
@ -705,7 +700,7 @@ static int ParseCommandLine(int argc, const char* argv[],
          ERROR_GOTO1("ERROR: Multiple actions specified.\n", ErrParse);
        } else {
          config->action_type_ = ACTION_INFO;
-          feature->arg_count_ = 0;
+          config->arg_count_ = 0;
          config->input_ = argv[i + 1];
        }
        i += 2;
@ -741,7 +736,7 @@ static int ParseCommandLine(int argc, const char* argv[],
      if (!strcmp(argv[i], "icc") || !strcmp(argv[i], "exif") ||
          !strcmp(argv[i], "xmp")) {
        if (FEATURETYPE_IS_NIL) {
-          feature->type_ = (!strcmp(argv[i], "icc")) ? FEATURE_ICCP :
+          config->type_ = (!strcmp(argv[i], "icc")) ? FEATURE_ICCP :
              (!strcmp(argv[i], "exif")) ? FEATURE_EXIF : FEATURE_XMP;
        } else {
          ERROR_GOTO1("ERROR: Multiple features specified.\n", ErrParse);
@ -757,7 +752,7 @@ static int ParseCommandLine(int argc, const char* argv[],
      } else if (!strcmp(argv[i], "frame") &&
                 (config->action_type_ == ACTION_GET)) {
        CHECK_NUM_ARGS_LESS(2, ErrParse);
-        feature->type_ = FEATURE_ANMF;
+        config->type_ = FEATURE_ANMF;
        arg->params_ = argv[i + 1];
        ++feature_arg_index;
        i += 2;
@ -777,9 +772,8 @@ static int ParseCommandLine(int argc, const char* argv[],
 }

 // Additional checks after config is filled.
-static int ValidateConfig(WebPMuxConfig* config) {
+static int ValidateConfig(Config* const config) {
  int ok = 1;
-  Feature* const feature = &config->feature_;

  // Action.
  if (ACTION_IS_NIL) {
@ -795,7 +789,7 @@ static int ValidateConfig(WebPMuxConfig* config) {
  if (config->input_ == NULL) {
    if (config->action_type_ != ACTION_SET) {
      ERROR_GOTO1("ERROR: No input file specified.\n", ErrValidate2);
-    } else if (feature->type_ != FEATURE_ANMF) {
+    } else if (config->type_ != FEATURE_ANMF) {
      ERROR_GOTO1("ERROR: No input file specified.\n", ErrValidate2);
    }
  }
@ -811,27 +805,28 @@ static int ValidateConfig(WebPMuxConfig* config) {

 // Create config object from command-line arguments.
 static int InitializeConfig(int argc, const char* argv[],
-                            WebPMuxConfig* config) {
+                            Config* const config) {
  int num_feature_args = 0;
-  int ok = 1;
+  int ok;

-  assert(config != NULL);
  memset(config, 0, sizeof(*config));

+  ok = ExUtilInitCommandLineArguments(argc, argv, &config->cmd_args_);
+  if (!ok) return 0;
+
  // Validate command-line arguments.
-  if (!ValidateCommandLine(argc, argv, &num_feature_args)) {
+  if (!ValidateCommandLine(&config->cmd_args_, &num_feature_args)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }

-  config->feature_.arg_count_ = num_feature_args;
-  config->feature_.args_ =
-      (FeatureArg*)calloc(num_feature_args, sizeof(*config->feature_.args_));
-  if (config->feature_.args_ == NULL) {
+  config->arg_count_ = num_feature_args;
+  config->args_ = (FeatureArg*)calloc(num_feature_args, sizeof(*config->args_));
+  if (config->args_ == NULL) {
    ERROR_GOTO1("ERROR: Memory allocation error.\n", Err1);
  }

  // Parse command-line.
-  if (!ParseCommandLine(argc, argv, config) || !ValidateConfig(config)) {
+  if (!ParseCommandLine(config) || !ValidateConfig(config)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }

@ -847,7 +842,7 @@ static int InitializeConfig(int argc, const char* argv[],
 //------------------------------------------------------------------------------
 // Processing.

-static int GetFrame(const WebPMux* mux, const WebPMuxConfig* config) {
+static int GetFrame(const WebPMux* mux, const Config* config) {
  WebPMuxError err = WEBP_MUX_OK;
  WebPMux* mux_single = NULL;
  int num = 0;
@ -857,7 +852,7 @@ static int GetFrame(const WebPMux* mux, const WebPMuxConfig* config) {
  WebPMuxFrameInfo info;
  WebPDataInit(&info.bitstream);

-  num = ExUtilGetInt(config->feature_.args_[0].params_, 10, &parse_error);
+  num = ExUtilGetInt(config->args_[0].params_, 10, &parse_error);
  if (num < 0) {
    ERROR_GOTO1("ERROR: Frame/Fragment index must be non-negative.\n", ErrGet);
  }
@ -891,18 +886,17 @@ static int GetFrame(const WebPMux* mux, const WebPMuxConfig* config) {
 }

 // Read and process config.
-static int Process(const WebPMuxConfig* config) {
+static int Process(const Config* config) {
  WebPMux* mux = NULL;
  WebPData chunk;
  WebPMuxError err = WEBP_MUX_OK;
  int ok = 1;
-  const Feature* const feature = &config->feature_;

  switch (config->action_type_) {
    case ACTION_GET: {
      ok = CreateMux(config->input_, &mux);
      if (!ok) goto Err2;
-      switch (feature->type_) {
+      switch (config->type_) {
        case FEATURE_ANMF:
          ok = GetFrame(mux, config);
          break;
@ -910,10 +904,10 @@ static int Process(const WebPMuxConfig* config) {
        case FEATURE_ICCP:
        case FEATURE_EXIF:
        case FEATURE_XMP:
-          err = WebPMuxGetChunk(mux, kFourccList[feature->type_], &chunk);
+          err = WebPMuxGetChunk(mux, kFourccList[config->type_], &chunk);
          if (err != WEBP_MUX_OK) {
            ERROR_GOTO3("ERROR (%s): Could not get the %s.\n",
-                        ErrorString(err), kDescriptions[feature->type_], Err2);
+                        ErrorString(err), kDescriptions[config->type_], Err2);
          }
          ok = WriteData(config->output_, &chunk);
          break;
@ -925,7 +919,7 @@ static int Process(const WebPMuxConfig* config) {
      break;
    }
    case ACTION_SET: {
-      switch (feature->type_) {
+      switch (config->type_) {
        case FEATURE_ANMF: {
          int i;
          WebPMuxAnimParams params = { 0xFFFFFFFF, 0 };
@ -934,11 +928,11 @@ static int Process(const WebPMuxConfig* config) {
            ERROR_GOTO2("ERROR (%s): Could not allocate a mux object.\n",
                        ErrorString(WEBP_MUX_MEMORY_ERROR), Err2);
          }
-          for (i = 0; i < feature->arg_count_; ++i) {
-            switch (feature->args_[i].subtype_) {
+          for (i = 0; i < config->arg_count_; ++i) {
+            switch (config->args_[i].subtype_) {
              case SUBTYPE_BGCOLOR: {
                uint32_t bgcolor;
-                ok = ParseBgcolorArgs(feature->args_[i].params_, &bgcolor);
+                ok = ParseBgcolorArgs(config->args_[i].params_, &bgcolor);
                if (!ok) {
                  ERROR_GOTO1("ERROR: Could not parse the background color \n",
                              Err2);
@ -949,7 +943,7 @@ static int Process(const WebPMuxConfig* config) {
              case SUBTYPE_LOOP: {
                int parse_error = 0;
                const int loop_count =
-                    ExUtilGetInt(feature->args_[i].params_, 10, &parse_error);
+                    ExUtilGetInt(config->args_[i].params_, 10, &parse_error);
                if (loop_count < 0 || loop_count > 65535) {
                  // Note: This is only a 'necessary' condition for loop_count
                  // to be valid. The 'sufficient' conditioned in checked in
@ -965,10 +959,10 @@ static int Process(const WebPMuxConfig* config) {
              case SUBTYPE_ANMF: {
                WebPMuxFrameInfo frame;
                frame.id = WEBP_CHUNK_ANMF;
-                ok = ReadFileToWebPData(feature->args_[i].filename_,
-                                        &frame.bitstream);
+                ok = ExUtilReadFileToWebPData(config->args_[i].filename_,
+                                              &frame.bitstream);
                if (!ok) goto Err2;
-                ok = ParseFrameArgs(feature->args_[i].params_, &frame);
+                ok = ParseFrameArgs(config->args_[i].params_, &frame);
                if (!ok) {
                  WebPDataClear(&frame.bitstream);
                  ERROR_GOTO1("ERROR: Could not parse frame properties.\n",
@ -1001,13 +995,13 @@ static int Process(const WebPMuxConfig* config) {
        case FEATURE_XMP: {
          ok = CreateMux(config->input_, &mux);
          if (!ok) goto Err2;
-          ok = ReadFileToWebPData(feature->args_[0].filename_, &chunk);
+          ok = ExUtilReadFileToWebPData(config->args_[0].filename_, &chunk);
          if (!ok) goto Err2;
-          err = WebPMuxSetChunk(mux, kFourccList[feature->type_], &chunk, 1);
+          err = WebPMuxSetChunk(mux, kFourccList[config->type_], &chunk, 1);
          free((void*)chunk.bytes);
          if (err != WEBP_MUX_OK) {
            ERROR_GOTO3("ERROR (%s): Could not set the %s.\n",
-                        ErrorString(err), kDescriptions[feature->type_], Err2);
+                        ErrorString(err), kDescriptions[config->type_], Err2);
          }
          break;
        }
@ -1043,11 +1037,11 @@ static int Process(const WebPMuxConfig* config) {
        for (i = 0; i < num_frames; ++i) durations[i] = -1;

        // Parse intervals to process.
-        for (i = 0; i < feature->arg_count_; ++i) {
+        for (i = 0; i < config->arg_count_; ++i) {
          int k;
          int args[3];
          int duration, start, end;
-          const int nb_args = ExUtilGetInts(feature->args_[i].params_,
+          const int nb_args = ExUtilGetInts(config->args_[i].params_,
                                            10, 3, args);
          ok = (nb_args >= 1);
          if (!ok) goto Err3;
@ -1105,12 +1099,12 @@ static int Process(const WebPMuxConfig* config) {
    case ACTION_STRIP: {
      ok = CreateMux(config->input_, &mux);
      if (!ok) goto Err2;
-      if (feature->type_ == FEATURE_ICCP || feature->type_ == FEATURE_EXIF ||
-          feature->type_ == FEATURE_XMP) {
-        err = WebPMuxDeleteChunk(mux, kFourccList[feature->type_]);
+      if (config->type_ == FEATURE_ICCP || config->type_ == FEATURE_EXIF ||
+          config->type_ == FEATURE_XMP) {
+        err = WebPMuxDeleteChunk(mux, kFourccList[config->type_]);
        if (err != WEBP_MUX_OK) {
          ERROR_GOTO3("ERROR (%s): Could not strip the %s.\n",
-                      ErrorString(err), kDescriptions[feature->type_], Err2);
+                      ErrorString(err), kDescriptions[config->type_], Err2);
        }
      } else {
        ERROR_GOTO1("ERROR: Invalid feature for action 'strip'.\n", Err2);
@ -1140,7 +1134,7 @@ static int Process(const WebPMuxConfig* config) {
 // Main.

 int main(int argc, const char* argv[]) {
-  WebPMuxConfig config;
+  Config config;
  int ok = InitializeConfig(argc - 1, argv + 1, &config);
  if (ok) {
    ok = Process(&config);
--- a/extras/Makefile.am
+++ b/extras/Makefile.am
@ -13,7 +13,10 @@ libwebpextras_la_LDFLAGS = -lm
 libwebpextras_la_LIBADD = ../src/libwebp.la

 noinst_PROGRAMS =
-noinst_PROGRAMS += get_disto webp_quality
+noinst_PROGRAMS += webp_quality
+if WANT_DEMUX
+  noinst_PROGRAMS += get_disto
+endif
 if BUILD_VWEBP_SDL
  noinst_PROGRAMS += vwebp_sdl
 endif
@ -27,7 +30,7 @@ get_disto_LDADD += ../src/libwebp.la
 get_disto_LDADD += $(PNG_LIBS) $(JPEG_LIBS) $(TIFF_LIBS)

 webp_quality_SOURCES  = webp_quality.c
-webp_quality_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+webp_quality_CPPFLAGS = $(AM_CPPFLAGS)
 webp_quality_LDADD =
 webp_quality_LDADD += ../imageio/libimageio_util.la
 webp_quality_LDADD += libwebpextras.la
--- a/extras/get_disto.c
+++ b/extras/get_disto.c
@ -290,9 +290,10 @@ int main(int argc, const char *argv[]) {
    fprintf(stderr, "Error while computing the distortion.\n");
    goto End;
  }
-  printf("%u %.2f    %.2f %.2f %.2f %.2f\n",
+  printf("%u %.2f    %.2f %.2f %.2f %.2f [ %.2f bpp ]\n",
         (unsigned int)size1,
-         disto[4], disto[0], disto[1], disto[2], disto[3]);
+         disto[4], disto[0], disto[1], disto[2], disto[3],
+         8.f * size1 / pic1.width / pic1.height);

  if (output != NULL) {
    uint8_t* data = NULL;
--- a/extras/quality_estimate.c
+++ b/extras/quality_estimate.c
@ -73,7 +73,7 @@ int VP8EstimateQuality(const uint8_t* const data, size_t size) {
  pos += 4;
  bit_pos = pos * 8;

-  GET_BIT(2);  // color_space + clamp type
+  GET_BIT(2);  // colorspace + clamp type

  // Segment header
  if (GET_BIT(1)) {       // use_segment_
--- a/extras/webp_to_sdl.c
+++ b/extras/webp_to_sdl.c
@ -12,7 +12,7 @@
 // Author: James Zern (jzern@google.com)

 #ifdef HAVE_CONFIG_H
-#include "webp/config.h"
+#include "src/webp/config.h"
 #endif

 #if defined(WEBP_HAVE_SDL)
@ -20,7 +20,7 @@
 #include "webp_to_sdl.h"

 #include <stdio.h>
-#include "webp/decode.h"
+#include "src/webp/decode.h"

 #if defined(WEBP_HAVE_JUST_SDL_H)
 #include <SDL.h>
--- a/imageio/Makefile.am
+++ b/imageio/Makefile.am
@ -1,7 +1,9 @@
 AM_CPPFLAGS += -I$(top_builddir)/src -I$(top_srcdir)/src
 noinst_LTLIBRARIES =
 noinst_LTLIBRARIES += libimageio_util.la
-noinst_LTLIBRARIES += libimagedec.la
+if WANT_DEMUX
+  noinst_LTLIBRARIES += libimagedec.la
+endif
 noinst_LTLIBRARIES += libimageenc.la

 noinst_HEADERS =
@ -21,9 +23,10 @@ libimagedec_la_SOURCES += tiffdec.c tiffdec.h
 libimagedec_la_SOURCES += webpdec.c webpdec.h
 libimagedec_la_SOURCES += wicdec.c wicdec.h
 libimagedec_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
-libimagedec_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+libimagedec_la_CPPFLAGS += $(AM_CPPFLAGS)
+libimagedec_la_LIBADD = ../src/demux/libwebpdemux.la

 libimageenc_la_SOURCES  =
 libimageenc_la_SOURCES += image_enc.c image_enc.h
 libimageenc_la_CPPFLAGS = $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
-libimageenc_la_CPPFLAGS += $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+libimageenc_la_CPPFLAGS += $(AM_CPPFLAGS)
--- a/imageio/image_enc.c
+++ b/imageio/image_enc.c
@ -158,14 +158,8 @@ static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
 }

 int WebPWritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  png_bytep row = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
-  const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
  volatile png_structp png;
  volatile png_infop info;
-  png_uint_32 y;

  if (out_file == NULL || buffer == NULL) return 0;

@ -184,14 +178,23 @@ int WebPWritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
    return 0;
  }
  png_init_io(png, out_file);
-  png_set_IHDR(png, info, width, height, 8,
-               has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
-               PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
-               PNG_FILTER_TYPE_DEFAULT);
-  png_write_info(png, info);
-  for (y = 0; y < height; ++y) {
-    png_write_rows(png, &row, 1);
-    row += stride;
+  {
+    const uint32_t width = buffer->width;
+    const uint32_t height = buffer->height;
+    png_bytep row = buffer->u.RGBA.rgba;
+    const int stride = buffer->u.RGBA.stride;
+    const int has_alpha = WebPIsAlphaMode(buffer->colorspace);
+    uint32_t y;
+
+    png_set_IHDR(png, info, width, height, 8,
+                 has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
+                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
+                 PNG_FILTER_TYPE_DEFAULT);
+    png_write_info(png, info);
+    for (y = 0; y < height; ++y) {
+      png_write_rows(png, &row, 1);
+      row += stride;
+    }
  }
  png_write_end(png, info);
  png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
--- a/imageio/imageio_util.c
+++ b/imageio/imageio_util.c
@ -47,7 +47,8 @@ int ImgIoUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
  while (!feof(stdin)) {
    // We double the buffer size each time and read as much as possible.
    const size_t extra_size = (max_size == 0) ? kBlockSize : max_size;
-    void* const new_data = realloc(input, max_size + extra_size);
+    // we allocate one extra byte for the \0 terminator
+    void* const new_data = realloc(input, max_size + extra_size + 1);
    if (new_data == NULL) goto Error;
    input = (uint8_t*)new_data;
    max_size += extra_size;
@ -55,6 +56,7 @@ int ImgIoUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
    if (size < max_size) break;
  }
  if (ferror(stdin)) goto Error;
+  if (input != NULL) input[size] = '\0';  // convenient 0-terminator
  *data = input;
  *data_size = size;
  return 1;
@ -68,7 +70,7 @@ int ImgIoUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
 int ImgIoUtilReadFile(const char* const file_name,
                      const uint8_t** data, size_t* data_size) {
  int ok;
-  void* file_data;
+  uint8_t* file_data;
  size_t file_size;
  FILE* in;
  const int from_stdin = (file_name == NULL) || !strcmp(file_name, "-");
@ -87,8 +89,14 @@ int ImgIoUtilReadFile(const char* const file_name,
  fseek(in, 0, SEEK_END);
  file_size = ftell(in);
  fseek(in, 0, SEEK_SET);
-  file_data = malloc(file_size);
-  if (file_data == NULL) return 0;
+  // we allocate one extra byte for the \0 terminator
+  file_data = (uint8_t*)malloc(file_size + 1);
+  if (file_data == NULL) {
+    fclose(in);
+    fprintf(stderr, "memory allocation failure when reading file %s\n",
+            file_name);
+    return 0;
+  }
  ok = (fread(file_data, file_size, 1, in) == 1);
  fclose(in);

@ -98,11 +106,14 @@ int ImgIoUtilReadFile(const char* const file_name,
    free(file_data);
    return 0;
  }
-  *data = (uint8_t*)file_data;
+  file_data[file_size] = '\0';  // convenient 0-terminator
+  *data = file_data;
  *data_size = file_size;
  return 1;
 }

+// -----------------------------------------------------------------------------
+
 int ImgIoUtilWriteFile(const char* const file_name,
                       const uint8_t* data, size_t data_size) {
  int ok;
--- a/imageio/imageio_util.h
+++ b/imageio/imageio_util.h
@ -30,6 +30,9 @@ FILE* ImgIoUtilSetBinaryMode(FILE* file);
 // Allocates storage for entire file 'file_name' and returns contents and size
 // in 'data' and 'data_size'. Returns 1 on success, 0 otherwise. '*data' should
 // be deleted using free().
+// Note: for convenience, the data will be null-terminated with an extra byte
+// (not accounted for in *data_size), in case the file is text and intended
+// to be used as a C-string.
 // If 'file_name' is NULL or equal to "-", input is read from stdin by calling
 // the function ImgIoUtilReadFromStdin().
 int ImgIoUtilReadFile(const char* const file_name,
--- a/imageio/pngdec.c
+++ b/imageio/pngdec.c
@ -185,7 +185,6 @@ static int ExtractMetadataFromPNG(png_structp png,
      }
    }
  }
-
  return 1;
 }

@ -265,6 +264,16 @@ int ReadPNG(const uint8_t* const data, size_t data_size,
    has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
  }

+  // Apply gamma correction if needed.
+  {
+    double image_gamma = 1 / 2.2, screen_gamma = 2.2;
+    int srgb_intent;
+    if (png_get_sRGB(png, info, &srgb_intent) ||
+        png_get_gAMA(png, info, &image_gamma)) {
+      png_set_gamma(png, screen_gamma, image_gamma);
+    }
+  }
+
  if (!keep_alpha) {
    png_set_strip_alpha(png);
    has_alpha = 0;
--- a/imageio/pnmdec.c
+++ b/imageio/pnmdec.c
@ -117,7 +117,7 @@ static size_t ReadPAMFields(PNMInfo* const info, size_t off) {
    }
  }
  if (!(info->seen_flags & TUPLE_FLAG)) {
-    if (info->depth > 0 && info->depth <= 4) {
+    if (info->depth > 0 && info->depth <= 4 && info->depth != 2) {
      info->seen_flags |= TUPLE_FLAG;
      info->bytes_per_px = info->depth * (info->max_value > 255 ? 2 : 1);
    } else {
@ -165,7 +165,7 @@ static size_t ReadHeader(PNMInfo* const info) {
  // perform some basic numerical validation
  if (info->width <= 0 || info->height <= 0 ||
      info->type <= 0 || info->type >= 9 ||
-      info->depth <= 0 || info->depth > 4 ||
+      info->depth <= 0 || info->depth == 2 || info->depth > 4 ||
      info->bytes_per_px < info->depth ||
      info->max_value <= 0 || info->max_value >= 65536) {
    return 0;
--- a/imageio/webpdec.c
+++ b/imageio/webpdec.c
@ -15,10 +15,12 @@

 #include "./webpdec.h"

+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>

 #include "webp/decode.h"
+#include "webp/demux.h"
 #include "webp/encode.h"
 #include "./imageio_util.h"
 #include "./metadata.h"
@ -95,25 +97,47 @@ VP8StatusCode DecodeWebPIncremental(
      fprintf(stderr, "Failed during WebPINewDecoder().\n");
      return VP8_STATUS_OUT_OF_MEMORY;
    } else {
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      size_t size = 0;
-      const size_t incr = 2 + (data_size / 20);
-      while (size < data_size) {
-        size_t next_size = size + (rand() % incr);
-        if (next_size > data_size) next_size = data_size;
-        status = WebPIUpdate(idec, data, next_size);
-        if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) break;
-        size = next_size;
-      }
-#else
      status = WebPIUpdate(idec, data, data_size);
-#endif
      WebPIDelete(idec);
    }
  }
  return status;
 }

+// -----------------------------------------------------------------------------
+// Metadata
+
+static int ExtractMetadata(const uint8_t* const data, size_t data_size,
+                           Metadata* const metadata) {
+  WebPData webp_data = { data, data_size };
+  WebPDemuxer* const demux = WebPDemux(&webp_data);
+  WebPChunkIterator chunk_iter;
+  uint32_t flags;
+
+  if (demux == NULL) return 0;
+  assert(metadata != NULL);
+
+  flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+  if ((flags & ICCP_FLAG) && WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter)) {
+    MetadataCopy((const char*)chunk_iter.chunk.bytes, chunk_iter.chunk.size,
+                 &metadata->iccp);
+    WebPDemuxReleaseChunkIterator(&chunk_iter);
+  }
+  if ((flags & EXIF_FLAG) && WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter)) {
+    MetadataCopy((const char*)chunk_iter.chunk.bytes, chunk_iter.chunk.size,
+                 &metadata->exif);
+    WebPDemuxReleaseChunkIterator(&chunk_iter);
+  }
+  if ((flags & XMP_FLAG) && WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter)) {
+    MetadataCopy((const char*)chunk_iter.chunk.bytes, chunk_iter.chunk.size,
+                 &metadata->xmp);
+    WebPDemuxReleaseChunkIterator(&chunk_iter);
+  }
+  WebPDemuxDelete(demux);
+  return 1;
+}
+
 // -----------------------------------------------------------------------------

 int ReadWebP(const uint8_t* const data, size_t data_size,
@ -127,11 +151,6 @@ int ReadWebP(const uint8_t* const data, size_t data_size,

  if (data == NULL || data_size == 0 || pic == NULL) return 0;

-  // TODO(jzern): add Exif/XMP/ICC extraction.
-  if (metadata != NULL) {
-    fprintf(stderr, "Warning: metadata extraction from WebP is unsupported.\n");
-  }
-
  if (!WebPInitDecoderConfig(&config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return 0;
@ -193,7 +212,6 @@ int ReadWebP(const uint8_t* const data, size_t data_size,

    status = DecodeWebP(data, data_size, &config);
    ok = (status == VP8_STATUS_OK);
-    if (!ok) WebPPictureFree(pic);
    if (ok && !keep_alpha && pic->use_argb) {
      // Need to wipe out the alpha value, as requested.
      int x, y;
@ -207,9 +225,18 @@ int ReadWebP(const uint8_t* const data, size_t data_size,

  if (status != VP8_STATUS_OK) {
    PrintWebPError("input data", status);
+    ok = 0;
  }

  WebPFreeDecBuffer(output_buffer);
+
+  if (ok && metadata != NULL) {
+    ok = ExtractMetadata(data, data_size, metadata);
+    if (!ok) {
+      PrintWebPError("metadata", VP8_STATUS_BITSTREAM_ERROR);
+    }
+  }
+  if (!ok) WebPPictureFree(pic);
  return ok;
 }

--- a/makefile.unix
+++ b/makefile.unix
@ -25,6 +25,7 @@ ifeq ($(strip $(shell uname)), Darwin)
  # Failure observed with: gcc 4.2.1 and 4.0.1.
  EXTRA_FLAGS += -fno-common
  EXTRA_FLAGS += -DHAVE_GLUT_GLUT_H
+  EXTRA_FLAGS += -Wno-deprecated-declarations
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
  GL_LIBS = -framework GLUT -framework OpenGL
@ -63,9 +64,6 @@ endif
 # 'make -f makefile.unix EXTRA_FLAGS=-m32' to that effect.
 # EXTRA_FLAGS += -m32

-# Extra flags to enable experimental features and code
-# EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES
-
 # Extra flags to enable byte swap for 16 bit colorspaces.
 # EXTRA_FLAGS += -DWEBP_SWAP_16BIT_CSP=1

@ -176,11 +174,13 @@ DSP_DEC_OBJS = \
    src/dsp/upsampling_msa.o \
    src/dsp/upsampling_neon.o \
    src/dsp/upsampling_sse2.o \
+    src/dsp/upsampling_sse41.o \
    src/dsp/yuv.o \
    src/dsp/yuv_mips32.o \
    src/dsp/yuv_mips_dsp_r2.o \
    src/dsp/yuv_neon.o \
    src/dsp/yuv_sse2.o \
+    src/dsp/yuv_sse41.o \

 DSP_ENC_OBJS = \
    src/dsp/cost.o \
@ -212,7 +212,6 @@ ENC_OBJS = \
    src/enc/backward_references_enc.o \
    src/enc/config_enc.o \
    src/enc/cost_enc.o \
-    src/enc/delta_palettization_enc.o \
    src/enc/filter_enc.o \
    src/enc/frame_enc.o \
    src/enc/histogram_enc.o \
@ -310,7 +309,6 @@ HDRS = \
    src/dsp/yuv.h \
    src/enc/backward_references_enc.h \
    src/enc/cost_enc.h \
-    src/enc/delta_palettization_enc.h \
    src/enc/histogram_enc.h \
    src/enc/vp8i_enc.h \
    src/enc/vp8li_enc.h \
@ -412,11 +410,13 @@ examples/anim_dump: src/libwebp.a
 examples/anim_dump: EXTRA_LIBS += $(GIF_LIBS) $(DWEBP_LIBS)
 examples/cwebp: examples/libexample_util.a
 examples/cwebp: imageio/libimagedec.a
+examples/cwebp: src/demux/libwebpdemux.a
 examples/cwebp: imageio/libimageio_util.a
 examples/cwebp: src/libwebp.a
 examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/dwebp: examples/libexample_util.a
 examples/dwebp: imageio/libimagedec.a
+examples/dwebp: src/demux/libwebpdemux.a
 examples/dwebp: imageio/libimageenc.a
 examples/dwebp: imageio/libimageio_util.a
 examples/dwebp: src/libwebp.a
@ -433,13 +433,17 @@ examples/webpmux: examples/libexample_util.a imageio/libimageio_util.a
 examples/webpmux: src/mux/libwebpmux.a src/libwebpdecoder.a
 examples/img2webp: examples/libexample_util.a imageio/libimageio_util.a
 examples/img2webp: imageio/libimagedec.a
+examples/img2webp: src/demux/libwebpdemux.a
 examples/img2webp: src/mux/libwebpmux.a src/libwebp.a
 examples/img2webp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/webpinfo: examples/libexample_util.a imageio/libimageio_util.a
 examples/webpinfo: src/libwebpdecoder.a

 extras/get_disto: extras/get_disto.o
-extras/get_disto: imageio/libimagedec.a imageio/libimageio_util.a src/libwebp.a
+extras/get_disto: imageio/libimagedec.a
+extras/get_disto: src/demux/libwebpdemux.a
+extras/get_disto: imageio/libimageio_util.a
+extras/get_disto: src/libwebp.a
 extras/get_disto: EXTRA_LIBS += $(CWEBP_LIBS)

 extras/webp_quality: extras/webp_quality.o
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "September 20, 2017"
+.TH GIF2WEBP 1 "January 25, 2018"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -20,6 +20,12 @@ Specify the name of the output WebP file. If omitted, \fBgif2webp\fP will
 perform conversion but only report statistics.
 Using "\-" as output name will direct output to 'stdout'.
 .TP
+.BI \-\- " string
+Explicitly specify the input file. This option is useful if the input
+file starts with an '\-' for instance. This option must appear \fBlast\fP.
+Any other options afterward will be ignored. If the input file is "\-",
+the data will be read from \fIstdin\fP instead of a file.
+.TP
 .B \-h, \-help
 Usage information.
 .TP
@ -109,6 +115,7 @@ the range of 20 to 50.
 .TP
 .B \-mt
 Use multi-threading for encoding, if possible.
+.TP
 .B \-loop_compatibility
 If enabled, handle the loop information in a compatible fashion for Chrome
 version prior to M62 (inclusive) and Firefox.
@ -136,6 +143,8 @@ gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
 gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
 .br
 gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif
+.br
+cat picture.gif | gif2webp \-o \- \-\- \- > output.webp

 .SH AUTHORS
 \fBgif2webp\fP is a part of libwebp and was written by the WebP team.
--- a/man/img2webp.1
+++ b/man/img2webp.1
@ -1,11 +1,13 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH IMG2WEBP 1 "January 23, 2017"
+.TH IMG2WEBP 1 "February 7, 2018"
 .SH NAME
 img2webp \- create animated WebP file from a sequence of input images.
 .SH SYNOPSIS
 .B img2webp
 [file_level_options] [files] [per_frame_options...]
 .br
+.B img2webp argument_file_name
+.br
 .SH DESCRIPTION
 This manual page documents the
 .B img2webp
@ -13,6 +15,9 @@ command.
 .PP
 \fBimg2webp\fP compresses a sequence of images using the animated WebP format.
 Input images can either be PNG, JPEG, TIFF or WebP.
+If a single file name (not starting with the character '\-') is supplied as
+the argument, the command line argument are actually tokenized from this file.
+This allows for easy scripting or using large number of arguments.
 .SH FILE-LEVEL OPTIONS
 The file-level options are applied at the beginning of the compression process,
 before the input frames are read.
@ -40,8 +45,8 @@ lossy or lossless compression for each frame heuristically. This global
 option disables the local option \fB-lossy\fP and \fB-lossless\fP .
 .TP
 .BI \-loop " int
-Specifies the number of times the animation should loop. Using '0' means
-'loop indefinitely'.
+Specifies the number of times the animation should loop. Using '0'
+means 'loop indefinitely'.
 .TP
 .BI \-v
 Be more verbose.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "November 10, 2016"
+.TH WEBPMUX 1 "December 1, 2017"
 .SH NAME
 webpmux \- create animated WebP files from non\-animated WebP images, extract
 frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
@ -48,6 +48,8 @@ frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
 .B webpmux [\-h|\-help]
 .br
 .B webpmux \-version
+.br
+.B webpmux argument_file_name
 .SH DESCRIPTION
 This manual page documents the
 .B webpmux
@ -55,6 +57,9 @@ command.
 .PP
 \fBwebpmux\fP can be used to create/extract from animated WebP files, as well as
 to add/extract/strip XMP/EXIF metadata and ICC profile.
+If a single file name (not starting with the character '\-') is supplied as
+the argument, the command line argument are actually tokenized from this file.
+This allows for easy scripting or using large number of arguments.
 .SH OPTIONS
 .SS GET_OPTIONS (\-get):
 .TP
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@ -25,5 +25,5 @@ libwebpdecodeinclude_HEADERS += ../webp/types.h
 noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

-libwebpdecode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+libwebpdecode_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpdecodeincludedir = $(includedir)/webp
--- a/src/dec/frame_dec.c
+++ b/src/dec/frame_dec.c
@ -400,7 +400,9 @@ static void DitherRow(VP8Decoder* const dec) {
 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB

 // Finalize and transmit a complete row. Return false in case of user-abort.
-static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
+static int FinishRow(void* arg1, void* arg2) {
+  VP8Decoder* const dec = (VP8Decoder*)arg1;
+  VP8Io* const io = (VP8Io*)arg2;
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
@ -448,10 +450,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    if (y_end > io->crop_bottom) {
      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
    }
+    // If dec->alpha_data_ is not NULL, we have some alpha plane present.
    io->a = NULL;
    if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
-      // good idea.
      io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@ -558,7 +559,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  if (io->bypass_filtering) {
    dec->filter_type_ = 0;
  }
-  // TODO(skal): filter type / strength / sharpness forcing

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
@ -569,8 +569,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Means: there's a dependency chain that goes all the way up to the
  // top-left corner of the picture (MB #0). We must filter all the previous
  // macroblocks.
-  // TODO(skal): add an 'approximate_decoding' option, that won't produce
-  // a 1:1 bit-exactness for complex filtering?
  {
    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
    if (dec->filter_type_ == 2) {
@ -651,7 +649,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
    }
    worker->data1 = dec;
    worker->data2 = (void*)&dec->thread_ctx_.io_;
-    worker->hook = (WebPWorkerHook)FinishRow;
+    worker->hook = FinishRow;
    dec->num_caches_ =
      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
  } else {
--- a/src/dec/vp8l_dec.c
+++ b/src/dec/vp8l_dec.c
@ -1643,17 +1643,17 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

 #if !defined(WEBP_REDUCE_SIZE)
    if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
-
-    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
-      // need the alpha-multiply functions for premultiplied output or rescaling
-      WebPInitAlphaProcessing();
-    }
 #else
    if (io->use_scaling) {
      dec->status_ = VP8_STATUS_INVALID_PARAM;
      goto Err;
    }
 #endif
+    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
+      // need the alpha-multiply functions for premultiplied output or rescaling
+      WebPInitAlphaProcessing();
+    }
+
    if (!WebPIsRGBMode(dec->output_->colorspace)) {
      WebPInitConvertARGBToYUV();
      if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -65,6 +65,8 @@ libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)
 libwebpdspdecode_sse41_la_SOURCES =
 libwebpdspdecode_sse41_la_SOURCES += alpha_processing_sse41.c
 libwebpdspdecode_sse41_la_SOURCES += dec_sse41.c
+libwebpdspdecode_sse41_la_SOURCES += upsampling_sse41.c
+libwebpdspdecode_sse41_la_SOURCES += yuv_sse41.c
 libwebpdspdecode_sse41_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdspdecode_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_FLAGS)

@ -139,7 +141,7 @@ noinst_HEADERS += ../webp/decode.h

 libwebpdsp_la_CPPFLAGS =
 libwebpdsp_la_CPPFLAGS += $(AM_CPPFLAGS)
-libwebpdsp_la_CPPFLAGS += $(USE_EXPERIMENTAL_CODE) $(USE_SWAP_16BIT_CSP)
+libwebpdsp_la_CPPFLAGS += $(USE_SWAP_16BIT_CSP)
 libwebpdsp_la_LDFLAGS = -lm
 libwebpdsp_la_LIBADD =
 libwebpdsp_la_LIBADD += libwebpdsp_avx2.la
--- a/src/dsp/common_sse2.h
+++ b/src/dsp/common_sse2.h
@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
-                                       __m128i* const in2, __m128i* const in3,
-                                       __m128i* const in4, __m128i* const in5) {
+static WEBP_INLINE void VP8PlanarTo24b_SSE2(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
  // The input is 6 registers of sixteen 8b but for the sake of explanation,
  // let's take 6 registers of four 8b values.
  // To pack, we will keep taking one every two 8b integer and move it
@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,

 // Convert four packed four-channel buffers like argbargbargbargb... into the
 // split channels aaaaa ... rrrr ... gggg .... bbbbb ......
-static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
-                                        __m128i* const in1,
-                                        __m128i* const in2,
-                                        __m128i* const in3) {
+static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
+                                             __m128i* const in1,
+                                             __m128i* const in2,
+                                             __m128i* const in3) {
  // Column-wise transpose.
  const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
  const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
--- a/src/dsp/common_sse41.h
+++ b/src/dsp/common_sse41.h
@ -0,0 +1,132 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE41_H_
+#define WEBP_DSP_COMMON_SSE41_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+// Channel mixing.
+// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
+#define WEBP_SSE41_SHUFF(OUT, IN0, IN1)    \
+  OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
+  OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
+  OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
+  OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
+  OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
+  OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void VP8PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
+  __m128i R0, R1, R2, R3, R4, R5;
+  __m128i G0, G1, G2, G3, G4, G5;
+  __m128i B0, B1, B2, B3, B4, B5;
+
+  // Process R.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+     -1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
+    WEBP_SSE41_SHUFF(R, in0, in1)
+  }
+
+  // Process G.
+  {
+    // Same as before, just shifted to the left by one and including the right
+    // padding.
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
+    const __m128i shuff2 = _mm_set_epi8(
+     -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
+    WEBP_SSE41_SHUFF(G, in2, in3)
+  }
+
+  // Process B.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+      15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
+    WEBP_SSE41_SHUFF(B, in4, in5)
+  }
+
+  // OR the different channels.
+  {
+    const __m128i RG0 = _mm_or_si128(R0, G0);
+    const __m128i RG1 = _mm_or_si128(R1, G1);
+    const __m128i RG2 = _mm_or_si128(R2, G2);
+    const __m128i RG3 = _mm_or_si128(R3, G3);
+    const __m128i RG4 = _mm_or_si128(R4, G4);
+    const __m128i RG5 = _mm_or_si128(R5, G5);
+    *in0 = _mm_or_si128(RG0, B0);
+    *in1 = _mm_or_si128(RG1, B1);
+    *in2 = _mm_or_si128(RG2, B2);
+    *in3 = _mm_or_si128(RG3, B3);
+    *in4 = _mm_or_si128(RG4, B4);
+    *in5 = _mm_or_si128(RG5, B5);
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
+
+// Convert four packed four-channel buffers like argbargbargbargb... into the
+// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
+static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
+                                              __m128i* const in1,
+                                              __m128i* const in2,
+                                              __m128i* const in3) {
+  // aaaarrrrggggbbbb
+  const __m128i shuff0 =
+      _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+  const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
+  const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
+  // A0A1R0R1
+  // G0G1B0B1
+  // A2A3R2R3
+  // G0G1B0B1
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
+  *in3 = _mm_unpacklo_epi64(B0, B2);
+  *in2 = _mm_unpackhi_epi64(B0, B2);
+  *in1 = _mm_unpacklo_epi64(B1, B3);
+  *in0 = _mm_unpackhi_epi64(B1, B3);
+}
+
+#endif  // WEBP_USE_SSE41
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE41_H_
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -25,10 +25,6 @@
 extern "C" {
 #endif

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "src/enc/delta_palettization_enc.h"
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
 //------------------------------------------------------------------------------
 // Decoding

--- a/src/dsp/lossless_enc_sse2.c
+++ b/src/dsp/lossless_enc_sse2.c
@ -46,16 +46,14 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 // Color Transform

+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
 static void TransformColor_SSE2(const VP8LMultipliers* const m,
                                uint32_t* argb_data, int num_pixels) {
-  const __m128i mults_rb = _mm_set_epi16(
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
-  const __m128i mults_b2 = _mm_set_epi16(
-      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
-      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
+  const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
+                                     CST_5b(m->green_to_blue_));
+  const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
@ -85,12 +83,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
                                            int tile_width, int tile_height,
                                            int green_to_blue, int red_to_blue,
                                            int histo[]) {
-  const __m128i mults_r = _mm_set_epi16(
-      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
-      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
-  const __m128i mults_g = _mm_set_epi16(
-      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
-      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
+  const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
  int y;
@ -135,9 +129,7 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
 static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
                                           int tile_width, int tile_height,
                                           int green_to_red, int histo[]) {
-  const __m128i mults_g = _mm_set_epi16(
-      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
-      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);

@ -174,6 +166,7 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
  }
 }
 #undef SPAN
+#undef MK_CST_16

 //------------------------------------------------------------------------------

--- a/src/dsp/lossless_enc_sse41.c
+++ b/src/dsp/lossless_enc_sse41.c
@ -18,6 +18,9 @@
 #include <smmintrin.h>
 #include "src/dsp/lossless.h"

+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
 //------------------------------------------------------------------------------
 // Subtract-Green Transform

@ -38,6 +41,95 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
  }
 }

+//------------------------------------------------------------------------------
+// Color Transform
+
+#define SPAN 8
+static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+                                             int tile_width, int tile_height,
+                                             int green_to_blue, int red_to_blue,
+                                             int histo[]) {
+  const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
+  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
+  const __m128i mask_g = _mm_set1_epi16(0xff00);   // green mask
+  const __m128i mask_gb = _mm_set1_epi32(0xffff);  // green/blue mask
+  const __m128i mask_b = _mm_set1_epi16(0x00ff);   // blue mask
+  const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
+                                            -1, -1, -1, -1, -1, -1, -1);
+  const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                            2, -1, 6, -1, 10, -1, 14);
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
+      const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
+      const __m128i r = _mm_or_si128(r0, r1);         // r 0
+      const __m128i gb0 = _mm_and_si128(in0, mask_gb);
+      const __m128i gb1 = _mm_and_si128(in1, mask_gb);
+      const __m128i gb = _mm_packus_epi32(gb0, gb1);  // g b
+      const __m128i g = _mm_and_si128(gb, mask_g);    // g 0
+      const __m128i A = _mm_mulhi_epi16(r, mults_r);  // x dbr
+      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dbg
+      const __m128i C = _mm_sub_epi8(gb, B);          // x b'
+      const __m128i D = _mm_sub_epi8(C, A);           // x b''
+      const __m128i E = _mm_and_si128(D, mask_b);     // 0 b''
+      _mm_storeu_si128((__m128i*)values, E);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+                                       left_over, tile_height,
+                                       green_to_blue, red_to_blue, histo);
+    }
+  }
+}
+
+static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_red, int histo[]) {
+  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask = _mm_set1_epi16(0xff);
+
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i g0 = _mm_and_si128(in0, mask_g);  // 0 0  | g 0
+      const __m128i g1 = _mm_and_si128(in1, mask_g);
+      const __m128i g = _mm_packus_epi32(g0, g1);     // g 0
+      const __m128i A0 = _mm_srli_epi32(in0, 16);     // 0 0  | x r
+      const __m128i A1 = _mm_srli_epi32(in1, 16);
+      const __m128i A = _mm_packus_epi32(A0, A1);     // x r
+      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dr
+      const __m128i C = _mm_sub_epi8(A, B);           // x r'
+      const __m128i D = _mm_and_si128(C, mask);       // 0 r'
+      _mm_storeu_si128((__m128i*)values, D);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+                                      left_over, tile_height, green_to_red,
+                                      histo);
+    }
+  }
+}
+
 //------------------------------------------------------------------------------
 // Entry point

@ -45,6 +137,8 @@ extern void VP8LEncDspInitSSE41(void);

 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
 }

 #else  // !WEBP_USE_SSE41
--- a/src/dsp/lossless_sse2.c
+++ b/src/dsp/lossless_sse2.c
@ -453,14 +453,11 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 5.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
-  const __m128i mults_rb = _mm_set_epi16(
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_));
-  const __m128i mults_b2 = _mm_set_epi16(
-      CST(red_to_blue_), 0, CST(red_to_blue_), 0,
-      CST(red_to_blue_), 0, CST(red_to_blue_), 0);
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
+  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
+#undef MK_CST_16
 #undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
@ -503,11 +500,11 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
-    VP8L32bToPlanar(&in0, &in1, &in2, &in3);
-    VP8L32bToPlanar(&in4, &in5, &in6, &in7);
+    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
+    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
    // At this points, in1/in5 contains red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
-    VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
+    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -217,6 +217,7 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST];

 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 extern void WebPInitYUV444ConvertersSSE2(void);
+extern void WebPInitYUV444ConvertersSSE41(void);

 static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
    (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
@ -242,6 +243,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
      WebPInitYUV444ConvertersSSE2();
    }
 #endif
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitYUV444ConvertersSSE41();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitYUV444ConvertersMIPSdspR2();
@ -255,6 +261,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
 // Main calls

 extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersSSE41(void);
 extern void WebPInitUpsamplersNEON(void);
 extern void WebPInitUpsamplersMIPSdspR2(void);
 extern void WebPInitUpsamplersMSA(void);
@ -287,6 +294,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
      WebPInitUpsamplersSSE2();
    }
 #endif
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitUpsamplersSSE41();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitUpsamplersMIPSdspR2();
@ -310,6 +322,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  assert(WebPUpsamplers[MODE_BGRA] != NULL);
  assert(WebPUpsamplers[MODE_rgbA] != NULL);
  assert(WebPUpsamplers[MODE_bgrA] != NULL);
+#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE
  assert(WebPUpsamplers[MODE_RGB] != NULL);
  assert(WebPUpsamplers[MODE_BGR] != NULL);
  assert(WebPUpsamplers[MODE_ARGB] != NULL);
@ -317,6 +330,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  assert(WebPUpsamplers[MODE_RGB_565] != NULL);
  assert(WebPUpsamplers[MODE_Argb] != NULL);
  assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+#endif

 #endif  // FANCY_UPSAMPLING
  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
--- a/src/dsp/upsampling_msa.c
+++ b/src/dsp/upsampling_msa.c
@ -264,6 +264,7 @@ static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
  bgr[2] = Clip8(r1 >> 6);
 }

+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
  const int y1 = MultHi(y, 19077);
  const int r1 = y1 + MultHi(v, 26149) - 14234;
@ -306,6 +307,7 @@ static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
  argb[0] = 0xff;
  YuvToRgb(y, u, v, argb + 1);
 }
+#endif  // WEBP_REDUCE_CSP

 static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
  YuvToBgr(y, u, v, bgra);
@ -317,6 +319,7 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
  rgba[3] = 0xff;
 }

+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
                         const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
@ -370,6 +373,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
    memcpy(dst, temp, length * 3 * sizeof(*dst));
  }
 }
+#endif  // WEBP_REDUCE_CSP

 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
@ -427,6 +431,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
  }
 }

+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
  v16u8 R, G, B;
@ -526,6 +531,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
    memcpy(dst, temp, length * 2 * sizeof(*dst));
  }
 }
+#endif  // WEBP_REDUCE_CSP

 #define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
  v16u8 s = __msa_aver_u_b(a, d);             \
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@ -104,21 +104,6 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
  Upsample32Pixels_SSE2(r1, r2, out);                                          \
 }

-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
-                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
-  int n;                                                                       \
-  for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n],                                 \
-         (top_dst) + ((cur_x) + n) * (XSTEP));                                 \
-  }                                                                            \
-  if ((bottom_y) != NULL) {                                                    \
-    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n],                  \
-           (bottom_dst) + ((cur_x) + n) * (XSTEP));                            \
-    }                                                                          \
-  }                                                                            \
-}
-
 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                       top_dst, bottom_dst, cur_x) do {                        \
  FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));   \
@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
  int uv_pos, pos;                                                             \
  /* 16byte-aligned array to cache reconstructed u and v */                    \
-  uint8_t uv_buf[4 * 32 + 15];                                                 \
+  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
  uint8_t* const r_v = r_u + 32;                                               \
                                                                               \
@ -160,11 +145,22 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
  }                                                                            \
  if (len > 1) {                                                               \
    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                 \
+    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                      \
+    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                          \
+    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;      \
    assert(left_over > 0);                                                     \
    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
-                pos, len - pos);                                               \
+    memcpy(tmp_top, top_y + pos, len - pos);                                   \
+    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);       \
+    CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst,              \
+         tmp_bottom_dst, 0);                                                   \
+    memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP));       \
+    if (bottom_y != NULL) {                                                    \
+      memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst,                       \
+             (len - pos) * (XSTEP));                                           \
+    }                                                                          \
  }                                                                            \
 }

--- a/src/dsp/upsampling_sse41.c
+++ b/src/dsp/upsampling_sse41.c
@ -0,0 +1,239 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 version of YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <assert.h>
+#include <smmintrin.h>
+#include <string.h>
+#include "src/dsp/yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#if !defined(WEBP_REDUCE_CSP)
+
+// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+// u = (9*a + 3*b + 3*c + d + 8) / 16
+//   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+//   = (a + m + 1) / 2
+// where m = (a + 3*b + 3*c + d) / 8
+//         = ((a + b + c + d) / 2 + b + c) / 4
+//
+// Let's say  k = (a + b + c + d) / 4.
+// We can compute k as
+// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+//
+// Then m can be written as
+// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+
+// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
+#define GET_M(ij, in, out) do {                                                \
+  const __m128i tmp0 = _mm_avg_epu8(k, (in));     /* (k + in + 1) / 2 */       \
+  const __m128i tmp1 = _mm_and_si128((ij), st);   /* (ij) & (s^t) */           \
+  const __m128i tmp2 = _mm_xor_si128(k, (in));    /* (k^in) */                 \
+  const __m128i tmp3 = _mm_or_si128(tmp1, tmp2);  /* ((ij) & (s^t)) | (k^in) */\
+  const __m128i tmp4 = _mm_and_si128(tmp3, one);  /* & 1 -> lsb_correction */  \
+  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
+} while (0)
+
+// pack and store two alternating pixel rows
+#define PACK_AND_STORE(a, b, da, db, out) do {                                 \
+  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
+  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
+  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);                             \
+  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);                             \
+  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
+  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
+} while (0)
+
+// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
+#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+  const __m128i one = _mm_set1_epi8(1);                                        \
+  const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]);                 \
+  const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]);                 \
+  const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]);                 \
+  const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]);                 \
+                                                                               \
+  const __m128i s = _mm_avg_epu8(a, d);        /* s = (a + d + 1) / 2 */       \
+  const __m128i t = _mm_avg_epu8(b, c);        /* t = (b + c + 1) / 2 */       \
+  const __m128i st = _mm_xor_si128(s, t);      /* st = s^t */                  \
+                                                                               \
+  const __m128i ad = _mm_xor_si128(a, d);      /* ad = a^d */                  \
+  const __m128i bc = _mm_xor_si128(b, c);      /* bc = b^c */                  \
+                                                                               \
+  const __m128i t1 = _mm_or_si128(ad, bc);     /* (a^d) | (b^c) */             \
+  const __m128i t2 = _mm_or_si128(t1, st);     /* (a^d) | (b^c) | (s^t) */     \
+  const __m128i t3 = _mm_and_si128(t2, one);   /* (a^d) | (b^c) | (s^t) & 1 */ \
+  const __m128i t4 = _mm_avg_epu8(s, t);                                       \
+  const __m128i k = _mm_sub_epi8(t4, t3);      /* k = (a + b + c + d) / 4 */   \
+  __m128i diag1, diag2;                                                        \
+                                                                               \
+  GET_M(bc, t, diag1);                  /* diag1 = (a + 3b + 3c + d) / 8 */    \
+  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
+                                                                               \
+  /* pack the alternate pixels */                                              \
+  PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
+  PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
+}
+
+// Turn the macro into a function for reducing code-size when non-critical
+static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
+                                  uint8_t* const out) {
+  UPSAMPLE_32PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                         \
+  uint8_t r1[17], r2[17];                                                      \
+  memcpy(r1, (tb), (num_pixels));                                              \
+  memcpy(r2, (bb), (num_pixels));                                              \
+  /* replicate last byte */                                                    \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
+  /* using the shared function instead of the macro saves ~3k code size */     \
+  Upsample32Pixels_SSE41(r1, r2, out);                                         \
+}
+
+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
+                       top_dst, bottom_dst, cur_x) do {                        \
+  FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));  \
+  if ((bottom_y) != NULL) {                                                    \
+    FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64,                   \
+                  (bottom_dst) + (cur_x) * (XSTEP));                           \
+  }                                                                            \
+} while (0)
+
+#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int uv_pos, pos;                                                             \
+  /* 16byte-aligned array to cache reconstructed u and v */                    \
+  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_v = r_u + 32;                                               \
+                                                                               \
+  assert(top_y != NULL);                                                       \
+  {   /* Treat the first pixel in regular way */                               \
+    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
+    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
+    if (bottom_y != NULL) {                                                    \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
+    }                                                                          \
+  }                                                                            \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */  \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  }                                                                            \
+  if (len > 1) {                                                               \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                 \
+    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                      \
+    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                          \
+    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;      \
+    assert(left_over > 0);                                                     \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
+    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
+    memcpy(tmp_top, top_y + pos, len - pos);                                   \
+    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);       \
+    CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst,              \
+         tmp_bottom_dst, 0);                                                   \
+    memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP));       \
+    if (bottom_y != NULL) {                                                    \
+      memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst,                       \
+             (len - pos) * (XSTEP));                                           \
+    }                                                                          \
+  }                                                                            \
+}
+
+// SSE4 variants of the fancy upsampler.
+SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41,  VP8YuvToRgb,  3)
+SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41,  VP8YuvToBgr,  3)
+
+#undef GET_M
+#undef PACK_AND_STORE
+#undef UPSAMPLE_32PIXELS
+#undef UPSAMPLE_LAST_BLOCK
+#undef CONVERT2RGB
+#undef CONVERT2RGB_32
+#undef SSE4_UPSAMPLE_FUNC
+
+#endif   // WEBP_REDUCE_CSP
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) {
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_SSE41;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_SSE41;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern void WebPInitYUV444ConvertersSSE41(void);
+
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+                   uint8_t* dst, int len);                                     \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  const int max_len = len & ~31;                                               \
+  for (i = 0; i < max_len; i += 32) {                                          \
+    CALL(y + i, u + i, v + i, dst + i * (XSTEP));                              \
+  }                                                                            \
+  if (i < len) {  /* C-fallback */                                             \
+    CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i);                   \
+  }                                                                            \
+}
+
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
+YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
+#endif  // WEBP_REDUCE_CSP
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE41;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE41;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#else
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41)
+
+#endif  // WEBP_USE_SSE41
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41)
+#endif
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -71,6 +71,7 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
 WebPSamplerRowFunc WebPSamplers[MODE_LAST];

 extern void WebPInitSamplersSSE2(void);
+extern void WebPInitSamplersSSE41(void);
 extern void WebPInitSamplersMIPS32(void);
 extern void WebPInitSamplersMIPSdspR2(void);

@ -99,6 +100,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
      WebPInitSamplersSSE2();
    }
 #endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitSamplersSSE41();
+    }
+#endif  // WEBP_USE_SSE41
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      WebPInitSamplersMIPS32();
@ -258,6 +264,7 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
    (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;

 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVSSE41(void);
 extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
 extern void WebPInitSharpYUVNEON(void);
@ -286,6 +293,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
      WebPInitSharpYUVSSE2();
    }
 #endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitConvertARGBToYUVSSE41();
+    }
+#endif  // WEBP_USE_SSE41
  }

 #if defined(WEBP_USE_NEON)
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@ -166,6 +166,19 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,

 #endif    // WEBP_USE_SSE2

+//-----------------------------------------------------------------------------
+// SSE41 extra functions (mostly for upsampling_sse41.c)
+
+#if defined(WEBP_USE_SSE41)
+
+// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
+void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+
+#endif    // WEBP_USE_SSE41
+
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion

--- a/src/dsp/yuv_sse2.c
+++ b/src/dsp/yuv_sse2.c
@ -180,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
  // Repeat the same permutations twice more:
  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
-  VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
+  VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);

  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
@ -492,7 +492,7 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
  __m128i a1 = LOAD_16(argb + 4);
  __m128i a2 = LOAD_16(argb + 8);
  __m128i a3 = LOAD_16(argb + 12);
-  VP8L32bToPlanar(&a0, &a1, &a2, &a3);
+  VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
  rgb[0] = _mm_unpacklo_epi8(a1, zero);
  rgb[1] = _mm_unpackhi_epi8(a1, zero);
  rgb[2] = _mm_unpacklo_epi8(a2, zero);
--- a/src/dsp/yuv_sse41.c
+++ b/src/dsp/yuv_sse41.c
@ -0,0 +1,613 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/yuv.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "src/dsp/common_sse41.h"
+#include <stdlib.h>
+#include <smmintrin.h>
+
+//-----------------------------------------------------------------------------
+// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
+
+// These constants are 14b fixed-point version of ITU-R BT.601 constants.
+// R = (19077 * y             + 26149 * v - 14234) >> 6
+// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
+// B = (19077 * y + 33050 * u             - 17685) >> 6
+static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
+                                     const __m128i* const U0,
+                                     const __m128i* const V0,
+                                     __m128i* const R,
+                                     __m128i* const G,
+                                     __m128i* const B) {
+  const __m128i k19077 = _mm_set1_epi16(19077);
+  const __m128i k26149 = _mm_set1_epi16(26149);
+  const __m128i k14234 = _mm_set1_epi16(14234);
+  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
+  const __m128i k33050 = _mm_set1_epi16((short)33050);
+  const __m128i k17685 = _mm_set1_epi16(17685);
+  const __m128i k6419  = _mm_set1_epi16(6419);
+  const __m128i k13320 = _mm_set1_epi16(13320);
+  const __m128i k8708  = _mm_set1_epi16(8708);
+
+  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
+
+  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
+  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
+  const __m128i R2 = _mm_add_epi16(R1, R0);
+
+  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
+  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
+  const __m128i G2 = _mm_add_epi16(Y1, k8708);
+  const __m128i G3 = _mm_add_epi16(G0, G1);
+  const __m128i G4 = _mm_sub_epi16(G2, G3);
+
+  // be careful with the saturated *unsigned* arithmetic here!
+  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
+  const __m128i B1 = _mm_adds_epu16(B0, Y1);
+  const __m128i B2 = _mm_subs_epu16(B1, k17685);
+
+  // use logical shift for B2, which can be larger than 32767
+  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
+  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
+  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
+}
+
+// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
+static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
+}
+
+// Load and replicate the U/V samples
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+  const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
+  return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
+}
+
+// Convert 32 samples of YUV444 to R/G/B
+static void YUV444ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
+                V0 = Load_HI_16_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
+
+// Convert 32 samples of YUV420 to R/G/B
+static void YUV420ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
+                V0 = Load_UV_HI_8_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5,
+    uint8_t* const rgb) {
+  // The input is 6 registers of sixteen 8b but for the sake of explanation,
+  // let's take 6 registers of four 8b values.
+  // To pack, we will keep taking one every two 8b integer and move it
+  // around as follows:
+  // Input:
+  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+  // Split the 6 registers in two sets of 3 registers: the first set as the even
+  // 8b bytes, the second the odd ones:
+  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+  // Repeat the same permutations twice more:
+  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+  VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
+
+  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
+  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
+  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
+  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
+  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
+  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
+}
+
+void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+  YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+  // Cast to 8b and store as RRRRGGGGBBBB.
+  rgb0 = _mm_packus_epi16(R0, R1);
+  rgb1 = _mm_packus_epi16(R2, R3);
+  rgb2 = _mm_packus_epi16(G0, G1);
+  rgb3 = _mm_packus_epi16(G2, G3);
+  rgb4 = _mm_packus_epi16(B0, B1);
+  rgb5 = _mm_packus_epi16(B2, B3);
+
+  // Pack as RGBRGBRGBRGB.
+  PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+}
+
+void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+  YUV444ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE41(y +  8, u +  8, v +  8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+  // Cast to 8b and store as BBBBGGGGRRRR.
+  bgr0 = _mm_packus_epi16(B0, B1);
+  bgr1 = _mm_packus_epi16(B2, B3);
+  bgr2 = _mm_packus_epi16(G0, G1);
+  bgr3 = _mm_packus_epi16(G2, G3);
+  bgr4 = _mm_packus_epi16(R0, R1);
+  bgr5= _mm_packus_epi16(R2, R3);
+
+  // Pack as BGRBGRBGRBGR.
+  PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+}
+
+//-----------------------------------------------------------------------------
+// Arbitrary-length row conversion functions
+
+static void YuvToRgbRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Cast to 8b and store as RRRRGGGGBBBB.
+    rgb0 = _mm_packus_epi16(R0, R1);
+    rgb1 = _mm_packus_epi16(R2, R3);
+    rgb2 = _mm_packus_epi16(G0, G1);
+    rgb3 = _mm_packus_epi16(G2, G3);
+    rgb4 = _mm_packus_epi16(B0, B1);
+    rgb5 = _mm_packus_epi16(B2, B3);
+
+    // Pack as RGBRGBRGBRGB.
+    PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off
+    VP8YuvToRgb(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);
+    v += (n & 1);
+  }
+}
+
+static void YuvToBgrRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Cast to 8b and store as BBBBGGGGRRRR.
+    bgr0 = _mm_packus_epi16(B0, B1);
+    bgr1 = _mm_packus_epi16(B2, B3);
+    bgr2 = _mm_packus_epi16(G0, G1);
+    bgr3 = _mm_packus_epi16(G2, G3);
+    bgr4 = _mm_packus_epi16(R0, R1);
+    bgr5 = _mm_packus_epi16(R2, R3);
+
+    // Pack as BGRBGRBGRBGR.
+    PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off
+    VP8YuvToBgr(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);
+    v += (n & 1);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE41;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE41;
+}
+
+//------------------------------------------------------------------------------
+// RGB24/32 -> YUV converters
+
+// Load eight 16b-words from *src.
+#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
+// Store either 16b-words into *dst
+#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
+
+#define WEBP_SSE41_SHUFF(OUT)  do {                  \
+  const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
+  const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
+  const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
+  const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
+  const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
+  const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
+                                                     \
+  /* OR everything to get one channel */             \
+  const __m128i tmp6 = _mm_or_si128(tmp0, tmp1);     \
+  const __m128i tmp7 = _mm_or_si128(tmp3, tmp4);     \
+  out[OUT + 0] = _mm_or_si128(tmp6, tmp2);           \
+  out[OUT + 1] = _mm_or_si128(tmp7, tmp5);           \
+} while (0);
+
+// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// Similar to PlanarTo24bHelper(), but in reverse order.
+static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
+    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+  const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));
+  const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
+  const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
+  const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
+  const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
+  const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
+
+  // Compute RR.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(0)
+  }
+  // Compute GG.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(2)
+  }
+  // Compute BB.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(4)
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
+
+// Convert 8 packed ARGB to r[], g[], b[]
+static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
+    const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i a0 = LOAD_16(argb + 0);
+  __m128i a1 = LOAD_16(argb + 4);
+  __m128i a2 = LOAD_16(argb + 8);
+  __m128i a3 = LOAD_16(argb + 12);
+  VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
+  rgb[0] = _mm_unpacklo_epi8(a1, zero);
+  rgb[1] = _mm_unpackhi_epi8(a1, zero);
+  rgb[2] = _mm_unpacklo_epi8(a2, zero);
+  rgb[3] = _mm_unpackhi_epi8(a2, zero);
+  rgb[4] = _mm_unpacklo_epi8(a3, zero);
+  rgb[5] = _mm_unpackhi_epi8(a3, zero);
+}
+
+// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
+// It's a macro and not a function because we need to use immediate values with
+// srai_epi32, e.g.
+#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
+                  ROUNDER, DESCALE_FIX, OUT) do {               \
+  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
+  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
+  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
+  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
+  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
+  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
+  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
+  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
+  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
+  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
+  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
+} while (0)
+
+#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
+static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
+                                            const __m128i* const G,
+                                            const __m128i* const B,
+                                            __m128i* const Y) {
+  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
+  const __m128i kGB_y = MK_CST_16(16384, 6420);
+  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
+}
+
+static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
+                                             const __m128i* const G,
+                                             const __m128i* const B,
+                                             __m128i* const U,
+                                             __m128i* const V) {
+  const __m128i kRG_u = MK_CST_16(-9719, -19081);
+  const __m128i kGB_u = MK_CST_16(0, 28800);
+  const __m128i kRG_v = MK_CST_16(28800, 0);
+  const __m128i kGB_v = MK_CST_16(-24116, -4684);
+  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
+            kHALF_UV, YUV_FIX + 2, *U);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
+            kHALF_UV, YUV_FIX + 2, *V);
+}
+
+#undef MK_CST_16
+#undef TRANSFORM
+
+static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+  const int max_width = width & ~31;
+  int i;
+  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
+    __m128i rgb_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
+
+    for (j = 0; j < 2; ++j, i += 16) {
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Convert to 16-bit Y.
+      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // Convert to 16-bit Y.
+      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+  const int max_width = width & ~31;
+  int i;
+  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
+    __m128i bgr_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
+
+    for (j = 0; j < 2; ++j, i += 16) {
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Convert to 16-bit Y.
+      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // Convert to 16-bit Y.
+      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, bgr += 3) {  // left-over
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+  const int max_width = width & ~15;
+  int i;
+  for (i = 0; i < max_width; i += 16) {
+    __m128i Y0, Y1, rgb[6];
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
+    ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
+    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+  }
+  for (; i < width; ++i) {   // left-over
+    const uint32_t p = argb[i];
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
+                     YUV_HALF);
+  }
+}
+
+// Horizontal add (doubled) of two 16b values, result is 16b.
+// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
+static void HorizontalAddPack_SSE41(const __m128i* const A,
+                                    const __m128i* const B,
+                                    __m128i* const out) {
+  const __m128i k2 = _mm_set1_epi16(2);
+  const __m128i C = _mm_madd_epi16(*A, k2);
+  const __m128i D = _mm_madd_epi16(*B, k2);
+  *out = _mm_packs_epi32(C, D);
+}
+
+static void ConvertARGBToUV_SSE41(const uint32_t* argb,
+                                  uint8_t* u, uint8_t* v,
+                                  int src_width, int do_store) {
+  const int max_width = src_width & ~31;
+  int i;
+  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
+    __m128i rgb[6], U0, V0, U1, V1;
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+
+    U0 = _mm_packus_epi16(U0, U1);
+    V0 = _mm_packus_epi16(V0, V1);
+    if (!do_store) {
+      const __m128i prev_u = LOAD_16(u);
+      const __m128i prev_v = LOAD_16(v);
+      U0 = _mm_avg_epu8(U0, prev_u);
+      V0 = _mm_avg_epu8(V0, prev_v);
+    }
+    STORE_16(U0, u);
+    STORE_16(V0, v);
+  }
+  if (i < src_width) {  // left-over
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+// Convert 16 packed ARGB 16b-values to r[], g[], b[]
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
+  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
+  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
+  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
+  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
+  // aarrggbb as 16-bit.
+  const __m128i shuff0 =
+      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i shuff1 =
+      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
+  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
+  // R0R1G0G1
+  // B0B1****
+  // R2R3G2G3
+  // B2B3****
+  // (OR is used to free port 5 for the unpack)
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_or_si128(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_or_si128(A2, A3);
+  // Gather the channels.
+  *r = _mm_unpacklo_epi64(B0, B2);
+  *g = _mm_unpackhi_epi64(B0, B2);
+  *b = _mm_unpackhi_epi64(B1, B3);
+}
+
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
+                                    uint8_t* u, uint8_t* v, int width) {
+  const int max_width = width & ~15;
+  const uint16_t* const last_rgb = rgb + 4 * max_width;
+  while (rgb < last_rgb) {
+    __m128i r, g, b, U0, V0, U1, V1;
+    RGBA32PackedToPlanar_16b_SSE41(rgb +  0, &r, &g, &b);
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
+    STORE_16(_mm_packus_epi16(U0, U1), u);
+    STORE_16(_mm_packus_epi16(V0, V1), v);
+    u += 16;
+    v += 16;
+    rgb += 2 * 32;
+  }
+  if (max_width < width) {  // left-over
+    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
+  WebPConvertARGBToY = ConvertARGBToY_SSE41;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
+
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
+
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
+}
+
+//------------------------------------------------------------------------------
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
+
+#endif  // WEBP_USE_SSE41
--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@ -10,8 +10,6 @@ libwebpencode_la_SOURCES += backward_references_enc.h
 libwebpencode_la_SOURCES += config_enc.c
 libwebpencode_la_SOURCES += cost_enc.c
 libwebpencode_la_SOURCES += cost_enc.h
-libwebpencode_la_SOURCES += delta_palettization_enc.c
-libwebpencode_la_SOURCES += delta_palettization_enc.h
 libwebpencode_la_SOURCES += filter_enc.c
 libwebpencode_la_SOURCES += frame_enc.c
 libwebpencode_la_SOURCES += histogram_enc.c
@ -40,5 +38,5 @@ noinst_HEADERS =
 noinst_HEADERS += ../webp/format_constants.h

 libwebpencode_la_LDFLAGS = -lm
-libwebpencode_la_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
+libwebpencode_la_CPPFLAGS = $(AM_CPPFLAGS)
 libwebpencodeincludedir = $(includedir)/webp
--- a/src/enc/alpha_enc.c
+++ b/src/enc/alpha_enc.c
@ -361,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc,
 //------------------------------------------------------------------------------
 // Main calls

-static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
+static int CompressAlphaJob(void* arg1, void* dummy) {
+  VP8Encoder* const enc = (VP8Encoder*)arg1;
  const WebPConfig* config = enc->config_;
  uint8_t* alpha_data = NULL;
  size_t alpha_size = 0;
@ -394,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
    WebPGetWorkerInterface()->Init(worker);
    worker->data1 = enc;
    worker->data2 = NULL;
-    worker->hook = (WebPWorkerHook)CompressAlphaJob;
+    worker->hook = CompressAlphaJob;
  }
 }

--- a/src/enc/analysis_enc.c
+++ b/src/enc/analysis_enc.c
@ -434,7 +434,9 @@ typedef struct {
 } SegmentJob;

 // main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+static int DoSegmentsJob(void* arg1, void* arg2) {
+  SegmentJob* const job = (SegmentJob*)arg1;
+  VP8EncIterator* const it = (VP8EncIterator*)arg2;
  int ok = 1;
  if (!VP8IteratorIsDone(it)) {
    uint8_t tmp[32 + WEBP_ALIGN_CST];
@ -462,7 +464,7 @@ static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
  WebPGetWorkerInterface()->Init(&job->worker);
  job->worker.data1 = job;
  job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  job->worker.hook = DoSegmentsJob;
  VP8IteratorInit(enc, &job->it);
  VP8IteratorSetRow(&job->it, start_row);
  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
--- a/src/enc/delta_palettization_enc.c
+++ b/src/enc/delta_palettization_enc.c
@ -1,455 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#include "src/enc/delta_palettization_enc.h"
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "src/webp/types.h"
-#include "src/dsp/lossless.h"
-
-#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
-
-// Format allows palette up to 256 entries, but more palette entries produce
-// bigger entropy. In the future it will probably be useful to add more entries
-// that are far from the origin of the palette or choose remaining entries
-// dynamically.
-#define DELTA_PALETTE_SIZE 226
-
-// Palette used for delta_palettization. Entries are roughly sorted by distance
-// of their signed equivalents from the origin.
-static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
-  MK_COL(0u, 0u, 0u),
-  MK_COL(255u, 255u, 255u),
-  MK_COL(1u, 1u, 1u),
-  MK_COL(254u, 254u, 254u),
-  MK_COL(2u, 2u, 2u),
-  MK_COL(4u, 4u, 4u),
-  MK_COL(252u, 252u, 252u),
-  MK_COL(250u, 0u, 0u),
-  MK_COL(0u, 250u, 0u),
-  MK_COL(0u, 0u, 250u),
-  MK_COL(6u, 0u, 0u),
-  MK_COL(0u, 6u, 0u),
-  MK_COL(0u, 0u, 6u),
-  MK_COL(0u, 0u, 248u),
-  MK_COL(0u, 0u, 8u),
-  MK_COL(0u, 248u, 0u),
-  MK_COL(0u, 248u, 248u),
-  MK_COL(0u, 248u, 8u),
-  MK_COL(0u, 8u, 0u),
-  MK_COL(0u, 8u, 248u),
-  MK_COL(0u, 8u, 8u),
-  MK_COL(8u, 8u, 8u),
-  MK_COL(248u, 0u, 0u),
-  MK_COL(248u, 0u, 248u),
-  MK_COL(248u, 0u, 8u),
-  MK_COL(248u, 248u, 0u),
-  MK_COL(248u, 8u, 0u),
-  MK_COL(8u, 0u, 0u),
-  MK_COL(8u, 0u, 248u),
-  MK_COL(8u, 0u, 8u),
-  MK_COL(8u, 248u, 0u),
-  MK_COL(8u, 8u, 0u),
-  MK_COL(23u, 23u, 23u),
-  MK_COL(13u, 13u, 13u),
-  MK_COL(232u, 232u, 232u),
-  MK_COL(244u, 244u, 244u),
-  MK_COL(245u, 245u, 250u),
-  MK_COL(50u, 50u, 50u),
-  MK_COL(204u, 204u, 204u),
-  MK_COL(236u, 236u, 236u),
-  MK_COL(16u, 16u, 16u),
-  MK_COL(240u, 16u, 16u),
-  MK_COL(16u, 240u, 16u),
-  MK_COL(240u, 240u, 16u),
-  MK_COL(16u, 16u, 240u),
-  MK_COL(240u, 16u, 240u),
-  MK_COL(16u, 240u, 240u),
-  MK_COL(240u, 240u, 240u),
-  MK_COL(0u, 0u, 232u),
-  MK_COL(0u, 232u, 0u),
-  MK_COL(232u, 0u, 0u),
-  MK_COL(0u, 0u, 24u),
-  MK_COL(0u, 24u, 0u),
-  MK_COL(24u, 0u, 0u),
-  MK_COL(32u, 32u, 32u),
-  MK_COL(224u, 32u, 32u),
-  MK_COL(32u, 224u, 32u),
-  MK_COL(224u, 224u, 32u),
-  MK_COL(32u, 32u, 224u),
-  MK_COL(224u, 32u, 224u),
-  MK_COL(32u, 224u, 224u),
-  MK_COL(224u, 224u, 224u),
-  MK_COL(0u, 0u, 176u),
-  MK_COL(0u, 0u, 80u),
-  MK_COL(0u, 176u, 0u),
-  MK_COL(0u, 176u, 176u),
-  MK_COL(0u, 176u, 80u),
-  MK_COL(0u, 80u, 0u),
-  MK_COL(0u, 80u, 176u),
-  MK_COL(0u, 80u, 80u),
-  MK_COL(176u, 0u, 0u),
-  MK_COL(176u, 0u, 176u),
-  MK_COL(176u, 0u, 80u),
-  MK_COL(176u, 176u, 0u),
-  MK_COL(176u, 80u, 0u),
-  MK_COL(80u, 0u, 0u),
-  MK_COL(80u, 0u, 176u),
-  MK_COL(80u, 0u, 80u),
-  MK_COL(80u, 176u, 0u),
-  MK_COL(80u, 80u, 0u),
-  MK_COL(0u, 0u, 152u),
-  MK_COL(0u, 0u, 104u),
-  MK_COL(0u, 152u, 0u),
-  MK_COL(0u, 152u, 152u),
-  MK_COL(0u, 152u, 104u),
-  MK_COL(0u, 104u, 0u),
-  MK_COL(0u, 104u, 152u),
-  MK_COL(0u, 104u, 104u),
-  MK_COL(152u, 0u, 0u),
-  MK_COL(152u, 0u, 152u),
-  MK_COL(152u, 0u, 104u),
-  MK_COL(152u, 152u, 0u),
-  MK_COL(152u, 104u, 0u),
-  MK_COL(104u, 0u, 0u),
-  MK_COL(104u, 0u, 152u),
-  MK_COL(104u, 0u, 104u),
-  MK_COL(104u, 152u, 0u),
-  MK_COL(104u, 104u, 0u),
-  MK_COL(216u, 216u, 216u),
-  MK_COL(216u, 216u, 40u),
-  MK_COL(216u, 216u, 176u),
-  MK_COL(216u, 216u, 80u),
-  MK_COL(216u, 40u, 216u),
-  MK_COL(216u, 40u, 40u),
-  MK_COL(216u, 40u, 176u),
-  MK_COL(216u, 40u, 80u),
-  MK_COL(216u, 176u, 216u),
-  MK_COL(216u, 176u, 40u),
-  MK_COL(216u, 176u, 176u),
-  MK_COL(216u, 176u, 80u),
-  MK_COL(216u, 80u, 216u),
-  MK_COL(216u, 80u, 40u),
-  MK_COL(216u, 80u, 176u),
-  MK_COL(216u, 80u, 80u),
-  MK_COL(40u, 216u, 216u),
-  MK_COL(40u, 216u, 40u),
-  MK_COL(40u, 216u, 176u),
-  MK_COL(40u, 216u, 80u),
-  MK_COL(40u, 40u, 216u),
-  MK_COL(40u, 40u, 40u),
-  MK_COL(40u, 40u, 176u),
-  MK_COL(40u, 40u, 80u),
-  MK_COL(40u, 176u, 216u),
-  MK_COL(40u, 176u, 40u),
-  MK_COL(40u, 176u, 176u),
-  MK_COL(40u, 176u, 80u),
-  MK_COL(40u, 80u, 216u),
-  MK_COL(40u, 80u, 40u),
-  MK_COL(40u, 80u, 176u),
-  MK_COL(40u, 80u, 80u),
-  MK_COL(80u, 216u, 216u),
-  MK_COL(80u, 216u, 40u),
-  MK_COL(80u, 216u, 176u),
-  MK_COL(80u, 216u, 80u),
-  MK_COL(80u, 40u, 216u),
-  MK_COL(80u, 40u, 40u),
-  MK_COL(80u, 40u, 176u),
-  MK_COL(80u, 40u, 80u),
-  MK_COL(80u, 176u, 216u),
-  MK_COL(80u, 176u, 40u),
-  MK_COL(80u, 176u, 176u),
-  MK_COL(80u, 176u, 80u),
-  MK_COL(80u, 80u, 216u),
-  MK_COL(80u, 80u, 40u),
-  MK_COL(80u, 80u, 176u),
-  MK_COL(80u, 80u, 80u),
-  MK_COL(0u, 0u, 192u),
-  MK_COL(0u, 0u, 64u),
-  MK_COL(0u, 0u, 128u),
-  MK_COL(0u, 192u, 0u),
-  MK_COL(0u, 192u, 192u),
-  MK_COL(0u, 192u, 64u),
-  MK_COL(0u, 192u, 128u),
-  MK_COL(0u, 64u, 0u),
-  MK_COL(0u, 64u, 192u),
-  MK_COL(0u, 64u, 64u),
-  MK_COL(0u, 64u, 128u),
-  MK_COL(0u, 128u, 0u),
-  MK_COL(0u, 128u, 192u),
-  MK_COL(0u, 128u, 64u),
-  MK_COL(0u, 128u, 128u),
-  MK_COL(176u, 216u, 216u),
-  MK_COL(176u, 216u, 40u),
-  MK_COL(176u, 216u, 176u),
-  MK_COL(176u, 216u, 80u),
-  MK_COL(176u, 40u, 216u),
-  MK_COL(176u, 40u, 40u),
-  MK_COL(176u, 40u, 176u),
-  MK_COL(176u, 40u, 80u),
-  MK_COL(176u, 176u, 216u),
-  MK_COL(176u, 176u, 40u),
-  MK_COL(176u, 176u, 176u),
-  MK_COL(176u, 176u, 80u),
-  MK_COL(176u, 80u, 216u),
-  MK_COL(176u, 80u, 40u),
-  MK_COL(176u, 80u, 176u),
-  MK_COL(176u, 80u, 80u),
-  MK_COL(192u, 0u, 0u),
-  MK_COL(192u, 0u, 192u),
-  MK_COL(192u, 0u, 64u),
-  MK_COL(192u, 0u, 128u),
-  MK_COL(192u, 192u, 0u),
-  MK_COL(192u, 192u, 192u),
-  MK_COL(192u, 192u, 64u),
-  MK_COL(192u, 192u, 128u),
-  MK_COL(192u, 64u, 0u),
-  MK_COL(192u, 64u, 192u),
-  MK_COL(192u, 64u, 64u),
-  MK_COL(192u, 64u, 128u),
-  MK_COL(192u, 128u, 0u),
-  MK_COL(192u, 128u, 192u),
-  MK_COL(192u, 128u, 64u),
-  MK_COL(192u, 128u, 128u),
-  MK_COL(64u, 0u, 0u),
-  MK_COL(64u, 0u, 192u),
-  MK_COL(64u, 0u, 64u),
-  MK_COL(64u, 0u, 128u),
-  MK_COL(64u, 192u, 0u),
-  MK_COL(64u, 192u, 192u),
-  MK_COL(64u, 192u, 64u),
-  MK_COL(64u, 192u, 128u),
-  MK_COL(64u, 64u, 0u),
-  MK_COL(64u, 64u, 192u),
-  MK_COL(64u, 64u, 64u),
-  MK_COL(64u, 64u, 128u),
-  MK_COL(64u, 128u, 0u),
-  MK_COL(64u, 128u, 192u),
-  MK_COL(64u, 128u, 64u),
-  MK_COL(64u, 128u, 128u),
-  MK_COL(128u, 0u, 0u),
-  MK_COL(128u, 0u, 192u),
-  MK_COL(128u, 0u, 64u),
-  MK_COL(128u, 0u, 128u),
-  MK_COL(128u, 192u, 0u),
-  MK_COL(128u, 192u, 192u),
-  MK_COL(128u, 192u, 64u),
-  MK_COL(128u, 192u, 128u),
-  MK_COL(128u, 64u, 0u),
-  MK_COL(128u, 64u, 192u),
-  MK_COL(128u, 64u, 64u),
-  MK_COL(128u, 64u, 128u),
-  MK_COL(128u, 128u, 0u),
-  MK_COL(128u, 128u, 192u),
-  MK_COL(128u, 128u, 64u),
-  MK_COL(128u, 128u, 128u),
-};
-
-#undef MK_COL
-
-//------------------------------------------------------------------------------
-// TODO(skal): move the functions to dsp/lossless.c when the correct
-// granularity is found. For now, we'll just copy-paste some useful bits
-// here instead.
-
-// In-place sum of each component with mod 256.
-static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
-  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
-  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
-  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-static WEBP_INLINE uint32_t Clip255(uint32_t a) {
-  if (a < 256) {
-    return a;
-  }
-  // return 0, when a is a negative integer.
-  // return 255, when a is positive.
-  return ~a >> 24;
-}
-
-// Delta palettization functions.
-static WEBP_INLINE int Square(int x) {
-  return x * x;
-}
-
-static WEBP_INLINE uint32_t Intensity(uint32_t a) {
-  return
-      30 * ((a >> 16) & 0xff) +
-      59 * ((a >>  8) & 0xff) +
-      11 * ((a >>  0) & 0xff);
-}
-
-static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
-                         uint32_t palette_entry) {
-  int i;
-  uint32_t distance = 0;
-  AddPixelsEq(&predicted_value, palette_entry);
-  for (i = 0; i < 32; i += 8) {
-    const int32_t av = (actual_value >> i) & 0xff;
-    const int32_t pv = (predicted_value >> i) & 0xff;
-    distance += Square(pv - av);
-  }
-  // We sum square of intensity difference with factor 10, but because Intensity
-  // returns 100 times real intensity we need to multiply differences of colors
-  // by 1000.
-  distance *= 1000u;
-  distance += Square(Intensity(predicted_value)
-                     - Intensity(actual_value));
-  return distance;
-}
-
-static uint32_t Predict(int x, int y, uint32_t* image) {
-  const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];
-  const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];
-  const uint32_t p =
-      (((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
-      (((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
-      (((((t >>  8) & 0xff) + ((l >>  8) & 0xff)) / 2) <<  8) +
-      (((((t >>  0) & 0xff) + ((l >>  0) & 0xff)) / 2) <<  0);
-  if (x == 0 && y == 0) return ARGB_BLACK;
-  if (x == 0) return t;
-  if (y == 0) return l;
-  return p;
-}
-
-static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
-    int a, int b, int c) {
-  return Clip255(a + ((b - c) >> 2));
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
-    uint32_t c0, uint32_t c1, uint32_t c2) {
-  const int a = AddSubtractComponentFullWithCoefficient(
-      c0 >> 24, c1 >> 24, c2 >> 24);
-  const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
-                                                       (c1 >> 16) & 0xff,
-                                                       (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
-                                                       (c1 >> 8) & 0xff,
-                                                       (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentFullWithCoefficient(
-      c0 & 0xff, c1 & 0xff, c2 & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-//------------------------------------------------------------------------------
-
-// Find palette entry with minimum error from difference of actual pixel value
-// and predicted pixel value. Propagate error of pixel to its top and left pixel
-// in src array. Write predicted_value + palette_entry to new_image. Return
-// index of best palette entry.
-static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
-                                const uint32_t palette[], int palette_size) {
-  int i;
-  int idx = 0;
-  uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
-  for (i = 1; i < palette_size; ++i) {
-    const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
-    if (distance < best_distance) {
-      best_distance = distance;
-      idx = i;
-    }
-  }
-  return idx;
-}
-
-static void ApplyBestPaletteEntry(int x, int y,
-                                  uint32_t new_value, uint32_t palette_value,
-                                  uint32_t* src, int src_stride,
-                                  uint32_t* new_image) {
-  AddPixelsEq(&new_value, palette_value);
-  if (x > 0) {
-    src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
-                                                       new_value, src[x]);
-  }
-  if (y > 0) {
-    src[x - src_stride] =
-        ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
-                                              new_value, src[x]);
-  }
-  new_image[x] = new_value;
-}
-
-//------------------------------------------------------------------------------
-// Main entry point
-
-static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
-                                           uint32_t src_stride,
-                                           uint32_t dst_stride,
-                                           const uint32_t* palette,
-                                           int palette_size,
-                                           int width, int height,
-                                           int num_passes) {
-  int x, y;
-  WebPEncodingError err = VP8_ENC_OK;
-  uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
-  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
-  if (new_image == NULL || tmp_row == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-
-  while (num_passes--) {
-    uint32_t* cur_src = src;
-    uint32_t* cur_dst = dst;
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t predicted_value = Predict(x, y, new_image);
-        tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
-                                          palette, palette_size);
-        ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
-                              cur_src, src_stride, new_image);
-      }
-      for (x = 0; x < width; ++x) {
-        cur_dst[x] = palette[tmp_row[x]];
-      }
-      cur_src += src_stride;
-      cur_dst += dst_stride;
-    }
-  }
- Error:
-  WebPSafeFree(new_image);
-  WebPSafeFree(tmp_row);
-  return err;
-}
-
-// replaces enc->argb_ by a palettizable approximation of it,
-// and generates optimal enc->palette_[]
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
-  const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst = enc->argb_;
-  const int width = pic->width;
-  const int height = pic->height;
-
-  WebPEncodingError err = VP8_ENC_OK;
-  memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
-  enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;
-  enc->palette_size_ = DELTA_PALETTE_SIZE;
-  err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,
-                          enc->palette_, enc->palette_size_,
-                          width, height, 2);
-  if (err != VP8_ENC_OK) goto Error;
-
- Error:
-  return err;
-}
-
-#else  // !WEBP_EXPERIMENTAL_FEATURES
-
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
-  (void)enc;
-  return VP8_ENC_ERROR_INVALID_CONFIGURATION;
-}
-
-#endif  // WEBP_EXPERIMENTAL_FEATURES
--- a/src/enc/delta_palettization_enc.h
+++ b/src/enc/delta_palettization_enc.h
@ -1,25 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
-
-#include "src/webp/encode.h"
-#include "src/enc/vp8li_enc.h"
-
-// Replaces enc->argb_[] input by a palettizable approximation of it,
-// and generates optimal enc->palette_[].
-// This function can revert enc->use_palette_ / enc->use_predict_ flag
-// if delta-palettization is not producing expected saving.
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
-
-#endif  // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
--- a/src/enc/frame_enc.c
+++ b/src/enc/frame_enc.c
@ -198,7 +198,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {

  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
    const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
+    ++p[mb->segment_];
  }
 #if !defined(WEBP_DISABLE_STATS)
  if (enc->pic_->stats != NULL) {
@ -520,6 +520,14 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }

+static void ResetSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->stats != NULL) {
+    memset(enc->block_count_, 0, sizeof(enc->block_count_));
+  }
+  ResetSSE(enc);
+}
 #else  // defined(WEBP_DISABLE_STATS)
 static void ResetSSE(VP8Encoder* const enc) {
  (void)enc;
@ -528,10 +536,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
  VP8Encoder* const enc = it->enc_;
  WebPPicture* const pic = enc->pic_;
  if (pic->extra_info != NULL) {
-    memset(pic->extra_info, 0,
-           enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    if (it->x_ == 0 && it->y_ == 0) {   // only do it once, at start
+      memset(pic->extra_info, 0,
+             enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    }
  }
 }
+
+static void ResetSideInfo(const VP8EncIterator* const it) {
+  (void)it;
+}
 #endif  // !defined(WEBP_DISABLE_STATS)

 static double GetPSNR(uint64_t mse, uint64_t size) {
@ -570,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
    VP8IteratorImport(&it, NULL);
    if (VP8Decimate(&it, &info, rd_opt)) {
      // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+      ++enc->proba_.nb_skip_;
    }
    RecordResiduals(&it, &info);
    size += info.R + info.H;
@ -841,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
      ++num_pass_left;
      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      if (is_last_pass) {
+        ResetSideInfo(&it);
+      }
      continue;                        // ...and start over
    }
    if (is_last_pass) {
@ -871,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
 #endif    // DISABLE_TOKEN_BUFFER

 //------------------------------------------------------------------------------
-
--- a/src/enc/histogram_enc.c
+++ b/src/enc/histogram_enc.c
@ -200,14 +200,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
  }
 }

-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol) {
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
  VP8LBitEntropy entropy;
  VP8LBitsEntropyUnrefined(array, n, &entropy);
-  if (trivial_symbol != NULL) {
-    *trivial_symbol =
-        (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
-  }

  return BitsEntropyRefine(&entropy);
 }
@ -605,7 +600,7 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
 }

 // Implement a Lehmer random number generator with a multiplicative constant of
-// 48271 and a modulo constant of 2^31 − 1.
+// 48271 and a modulo constant of 2^31 - 1.
 static uint32_t MyRand(uint32_t* const seed) {
  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
  assert(*seed > 0);
@ -1031,7 +1026,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
    }
  }

-  // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
+  // TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode.
  // Find the optimal map from original histograms to the final ones.
  HistogramRemap(orig_histo, image_histo, histogram_symbols);

--- a/src/enc/histogram_enc.h
+++ b/src/enc/histogram_enc.h
@ -109,10 +109,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             uint16_t* const histogram_symbols);

 // Returns the entropy for the symbols in the input array.
-// Also sets trivial_symbol to the code value, if the array has only one code
-// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol);
+double VP8LBitsEntropy(const uint32_t* const array, int n);

 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
--- a/src/enc/iterator_enc.c
+++ b/src/enc/iterator_enc.c
@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
  memset(it->u_left_, 129, 8);
  memset(it->v_left_, 129, 8);
  it->left_nz_[8] = 0;
+  if (it->top_derr_ != NULL) {
+    memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+  }
 }

 static void InitTop(VP8EncIterator* const it) {
@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
  const size_t top_size = enc->mb_w_ * 16;
  memset(enc->y_top_, 127, 2 * top_size);
  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+  if (enc->top_derr_ != NULL) {
+    memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+  }
 }

 void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
  it->u_left_ = it->y_left_ + 16 + 16;
  it->v_left_ = it->u_left_ + 16;
+  it->top_derr_ = enc->top_derr_;
  VP8IteratorReset(it);
 }

@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 }

 //------------------------------------------------------------------------------
-
--- a/src/enc/picture_csp_enc.c
+++ b/src/enc/picture_csp_enc.c
@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W

 #if defined(USE_GAMMA_COMPRESSION)

-// float variant of gamma-correction
 // We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
-static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
-static float kLinearToGammaTabF[kGammaTabSize + 2];
-static volatile int kGammaTablesFOk = 0;
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;

-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
-  if (!kGammaTablesFOk) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
+  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
+  if (!kGammaTablesSOk) {
    int v;
    const double norm = 1. / MAX_Y_T;
    const double scale = 1. / kGammaTabSize;
    const double a = 0.09929682680944;
    const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
    for (v = 0; v <= MAX_Y_T; ++v) {
      const double g = norm * v;
+      double value;
      if (g <= thresh * 4.5) {
-        kGammaToLinearTabF[v] = (float)(g / 4.5);
+        value = g / 4.5;
      } else {
        const double a_rec = 1. / (1. + a);
-        kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+        value = pow(a_rec * (g + a), kGammaF);
      }
+      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
    }
    for (v = 0; v <= kGammaTabSize; ++v) {
      const double g = scale * v;
@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
      } else {
        value = (1. + a) * pow(g, 1. / kGammaF) - a;
      }
-      kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
+      // we already incorporate the 1/2 rounding constant here
+      kLinearToGammaTabS[v] =
+          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
    }
    // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
-    kGammaTablesFOk = 1;
+    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+    kGammaTablesSOk = 1;
  }
 }

-static WEBP_INLINE float GammaToLinearF(int v) {
-  return kGammaToLinearTabF[v];
+// return value has a fixed-point precision of GAMMA_TO_LINEAR_BITS
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return kGammaToLinearTabS[v];
 }

-static WEBP_INLINE int LinearToGammaF(float value) {
-  const float v = value * kGammaTabSize;
-  const int tab_pos = (int)v;
-  const float x = v - (float)tab_pos;      // fractional part
-  const float v0 = kLinearToGammaTabF[tab_pos + 0];
-  const float v1 = kLinearToGammaTabF[tab_pos + 1];
-  const float y = v1 * x + v0 * (1.f - x);  // interpolate
-  return (int)(y + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+  const uint32_t v = value * kGammaTabSize;
+  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;
+  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
+  // v0 / v1 are in GAMMA_TO_LINEAR_BITS fixed-point precision (range [0..1])
+  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+  // Final interpolation. Note that rounding is already included.
+  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
+  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+  return result;
 }

 #else

-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
-static WEBP_INLINE float GammaToLinearF(int v) {
-  const float norm = 1.f / MAX_Y_T;
-  return norm * v;
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {}
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;
 }
-static WEBP_INLINE int LinearToGammaF(float value) {
-  return (int)(MAX_Y_T * value + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {
+  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;
 }

 #endif    // USE_GAMMA_COMPRESSION
@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) {
  return (luma >> YUV_FIX);
 }

-static float RGBToGrayF(float r, float g, float b) {
-  return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
-}
-
-static int ScaleDown(int a, int b, int c, int d) {
-  const float A = GammaToLinearF(a);
-  const float B = GammaToLinearF(b);
-  const float C = GammaToLinearF(c);
-  const float D = GammaToLinearF(d);
-  return LinearToGammaF(0.25f * (A + B + C + D));
+static uint32_t ScaleDown(int a, int b, int c, int d) {
+  const uint32_t A = GammaToLinearS(a);
+  const uint32_t B = GammaToLinearS(b);
+  const uint32_t C = GammaToLinearS(c);
+  const uint32_t D = GammaToLinearS(d);
+  return LinearToGammaS((A + B + C + D + 2) >> 2);
 }

 static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
  int i;
  for (i = 0; i < w; ++i) {
-    const float R = GammaToLinearF(src[0 * w + i]);
-    const float G = GammaToLinearF(src[1 * w + i]);
-    const float B = GammaToLinearF(src[2 * w + i]);
-    const float Y = RGBToGrayF(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaF(Y);
+    const uint32_t R = GammaToLinearS(src[0 * w + i]);
+    const uint32_t G = GammaToLinearS(src[1 * w + i]);
+    const uint32_t B = GammaToLinearS(src[2 * w + i]);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)LinearToGammaS(Y);
  }
 }

@ -863,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
  }

  if (use_iterative_conversion) {
-    InitGammaTablesF();
+    InitGammaTablesS();
    if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
      return 0;
    }
@ -1110,10 +1117,14 @@ static int Import(WebPPicture* const picture,
    uint32_t* dst = picture->argb;
    const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
    assert(step == 4);
-    for (y = 0; y < height; ++y) {
-      if (do_copy) {
+    if (do_copy) {
+      for (y = 0; y < height; ++y) {
        memcpy(dst, rgb, width * 4);
-      } else {
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
+      }
+    } else {
+      for (y = 0; y < height; ++y) {
 #ifdef WORDS_BIGENDIAN
        // BGRA or RGBA input order.
        const uint8_t* a_ptr = rgb + 3;
@ -1125,9 +1136,9 @@ static int Import(WebPPicture* const picture,
        // RGBA input order. Need to swap R and B.
        VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
 #endif
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
      }
-      rgb += rgb_stride;
-      dst += picture->argb_stride;
    }
  } else {
    uint32_t* dst = picture->argb;
--- a/src/enc/picture_psnr_enc.c
+++ b/src/enc/picture_psnr_enc.c
@ -18,6 +18,7 @@
 #include <math.h>
 #include <stdlib.h>

+#include "src/dsp/dsp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"

@ -169,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
  return 1;
 }

+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3   // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0   // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
 int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                          int type, float results[5]) {
  int w, h, c;
@ -195,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
    float distortion;
    const size_t stride0 = 4 * (size_t)p0.argb_stride;
    const size_t stride1 = 4 * (size_t)p1.argb_stride;
-    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
-                             (const uint8_t*)p1.argb + c, stride1,
+    // results are reported as BGRA
+    const int offset = c ^ BLUE_OFFSET;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+                             (const uint8_t*)p1.argb + offset, stride1,
                             w, h, 4, type, &distortion, results + c)) {
      goto Error;
    }
@ -214,6 +223,8 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
  return ok;
 }

+#undef BLUE_OFFSET
+
 #else  // defined(WEBP_DISABLE_STATS)
 int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
                        const uint8_t* ref, size_t ref_stride,
--- a/src/enc/quant_enc.c
+++ b/src/enc/quant_enc.c
@ -826,6 +826,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
  return nz;
 }

+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
+
+// Quantize as usual, but also compute and return the quantization error.
+// Error is already divided by DSHIFT.
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSCALE;
+  }
+  *v = 0;
+  return (sign ? -V : V) >> DSCALE;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  //
+  // Final errors {err1,err2,err3} are preserved and later restored
+  // as top[]/left[] on the next block.
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];
+    int err0, err1, err2, err3;
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // restore err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
@ -839,6 +918,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
@ -1101,6 +1182,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
@ -1109,6 +1193,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
 }

 //------------------------------------------------------------------------------
--- a/src/enc/vp8i_enc.h
+++ b/src/enc/vp8i_enc.h
@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER

+// quality below which error-diffusion is enabled
+#define ERROR_DIFFUSION_QUALITY 98
+
 //------------------------------------------------------------------------------
 // Headers

@ -201,6 +204,8 @@ typedef struct {
  score_t i4_penalty_;   // penalty for using Intra4
 } VP8SegmentInfo;

+typedef int8_t DError[2 /* u/v */][2 /* top or left */];
+
 // Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
@ -213,6 +218,7 @@ typedef struct {
  uint8_t modes_i4[16];       // mode numbers for intra4 predictions
  int mode_uv;                // mode number of chroma prediction
  uint32_t nz;                // non-zero blocks
+  int8_t derr[2][3];          // DC diffusion errors for U/V for blocks #1/2/3
 } VP8ModeScore;

 // Iterator structure to iterate through macroblocks, pointing to the
@ -242,6 +248,9 @@ typedef struct {
  int           count_down0_;      // starting counter value (for progress)
  int           percent0_;         // saved initial progress percent

+  DError        left_derr_;        // left error diffusion (u/v)
+  DError       *top_derr_;         // top diffusion error - NULL if disabled
+
  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
@ -401,6 +410,7 @@ struct VP8Encoder {
  uint8_t*   uv_top_;    // top u/v samples.
                         // U and V are packed into 16 bytes (8 U + 8 V)
  LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  DError*    top_derr_;  // diffusion error (NULL if disabled)
 };

 //------------------------------------------------------------------------------
--- a/src/enc/vp8l_enc.c
+++ b/src/enc/vp8l_enc.c
@ -26,8 +26,6 @@
 #include "src/utils/utils.h"
 #include "src/webp/format_constants.h"

-#include "src/enc/delta_palettization_enc.h"
-
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600

@ -259,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
      ++histo[kHistoAlphaPred * 256];

      for (j = 0; j < kHistoTotal; ++j) {
-        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
      }
      entropy[kDirect] = entropy_comp[kHistoAlpha] +
          entropy_comp[kHistoRed] +
@ -384,8 +382,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
      AnalyzeAndCreatePalette(pic, low_effort,
                              enc->palette_, &enc->palette_size_);

-  // TODO(jyrki): replace the decision to be based on an actual estimate
-  // of entropy, or even spatial variance of entropy.
+  // Empirical bit sizes.
  enc->histo_bits_ = GetHistoBits(method, use_palette,
                                  pic->width, pic->height);
  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
@ -756,7 +753,6 @@ static WebPEncodingError StoreImageToBitMask(
      // Don't write the distance with the extra bits code since
      // the distance can be up to 18 bits of extra bits, and the prefix
      // 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
-      // TODO(jyrki): optimize this further.
      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
      WriteHuffmanCode(bw, codes + 4, code);
      VP8LPutBits(bw, bits, n_bits);
@ -1464,49 +1460,6 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
                              20 /* quality */, low_effort);
 }

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-static WebPEncodingError EncodeDeltaPalettePredictorImage(
-    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
-    int low_effort) {
-  const WebPPicture* const pic = enc->pic_;
-  const int width = pic->width;
-  const int height = pic->height;
-
-  const int pred_bits = 5;
-  const int transform_width = VP8LSubSampleSize(width, pred_bits);
-  const int transform_height = VP8LSubSampleSize(height, pred_bits);
-  const int pred = 7;   // default is Predictor7 (Top/Left Average)
-  const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
-  uint32_t* predictors;
-  int tile_x, tile_y;
-  WebPEncodingError err = VP8_ENC_OK;
-
-  predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
-                                         sizeof(*predictors));
-  if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-
-  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-      predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
-    }
-  }
-
-  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
-  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
-  VP8LPutBits(bw, pred_bits - 2, 3);
-  err = EncodeImageNoHuffman(
-      bw, predictors, &enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
-      (VP8LBackwardRefs*)&enc->refs_[1],
-      transform_width, transform_height, quality, low_effort);
-  WebPSafeFree(predictors);
-  return err;
-}
-
-#endif // WEBP_EXPERIMENTAL_FEATURES
-
 // -----------------------------------------------------------------------------
 // VP8LEncoder

@ -1568,7 +1521,7 @@ static int EncodeStreamHook(void* input, void* data2) {
  WebPEncodingError err = VP8_ENC_OK;
  const int quality = (int)config->quality;
  const int low_effort = (config->method == 0);
-#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
+#if (WEBP_NEAR_LOSSLESS == 1)
  const int width = picture->width;
 #endif
  const int height = picture->height;
@ -1627,29 +1580,6 @@ static int EncodeStreamHook(void* input, void* data2) {
    enc->argb_content_ = kEncoderNone;
 #endif

-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (config->use_delta_palette) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 0;
-      enc->use_subtract_green_ = 0;
-      enc->use_palette_ = 1;
-      if (enc->argb_content_ != kEncoderNearLossless &&
-          enc->argb_content_ != kEncoderPalette) {
-        err = MakeInputImageCopy(enc);
-        if (err != VP8_ENC_OK) goto Error;
-      }
-      err = WebPSearchOptimalDeltaPalette(enc);
-      if (err != VP8_ENC_OK) goto Error;
-      if (enc->use_palette_) {
-        err = AllocateTransformBuffer(enc, width, height);
-        if (err != VP8_ENC_OK) goto Error;
-        err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
-        if (err != VP8_ENC_OK) goto Error;
-        use_delta_palette = 1;
-      }
-    }
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
    // Encode palette
    if (enc->use_palette_) {
      err = EncodePalette(bw, low_effort, enc);
@ -1822,7 +1752,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
      worker_interface->Init(worker);
      worker->data1 = param;
      worker->data2 = NULL;
-      worker->hook = (WebPWorkerHook)EncodeStreamHook;
+      worker->hook = EncodeStreamHook;
    }
  }

@ -1944,7 +1874,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
  err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
  if (err != VP8_ENC_OK) goto Error;

-  // TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
  if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;

  // Finish the RIFF chunk.
--- a/src/enc/webp_enc.c
+++ b/src/enc/webp_enc.c
@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
      + WEBP_ALIGN_CST;                      // align all
  const size_t lf_stats_size =
      config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+  const size_t top_derr_size =
+      (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+          mb_w * sizeof(*enc->top_derr_) : 0;
  uint8_t* mem;
  const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
                      + WEBP_ALIGN_CST           // cache alignment
                      + info_size                // modes info
                      + preds_size               // prediction modes
                      + samples_size             // top/left samples
+                      + top_derr_size            // top diffusion error
                      + nz_size                  // coeff context bits
                      + lf_stats_size;           // autofilter stats

@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
         "                info: %ld\n"
         "               preds: %ld\n"
         "         top samples: %ld\n"
+         "       top diffusion: %ld\n"
         "            non-zero: %ld\n"
         "            lf-stats: %ld\n"
         "               total: %ld\n",
         sizeof(*enc) + WEBP_ALIGN_CST, info_size,
-         preds_size, samples_size, nz_size, lf_stats_size, size);
+         preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
  printf("Transient object sizes:\n"
         "      VP8EncIterator: %ld\n"
         "        VP8ModeScore: %ld\n"
@ -219,6 +224,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->y_top_ = mem;
  enc->uv_top_ = enc->y_top_ + top_stride;
  mem += 2 * top_stride;
+  enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+  mem += top_derr_size;
  assert(mem <= (uint8_t*)enc + size);

  enc->config_ = config;
Author	SHA1	Message	Date
James Zern	e122e511cf	cwebp,get_disto: fix bpp output bits-per-pixel were intended, not bytes-per-pixel Change-Id: I023349013ac5956154ab4526bd1e195dfe95b8ab	2018-04-10 15:51:23 -07:00
Vincent Rabaud	1c8f358df4	Fix CMake with WASM. Change-Id: I21c78999484815c58b5b2c9795310da9d04888e9	2018-04-05 16:25:15 +02:00
Pascal Massimino	a0215fb7dc	webp_js: fix webp_js demo html We need to export 'Module.cwrap' method. Change-Id: I2986c5a4c06630ae3f95086a114e727a86c99a2b	2018-04-05 16:25:15 +02:00
James Zern	abb4776006	Merge "Use proper targets for CMake."	2018-04-04 01:35:03 +00:00
Vincent Rabaud	e155dda0cc	Use proper targets for CMake. Also fix the bug where near lossless was not used and allow examples to be built by default. Change-Id: Ieb5ef77fafe83f3776ff4fd27a6d26534c7a51f3	2018-04-03 17:49:08 -07:00
Pascal Massimino	994be82d00	Merge "Remove some very hard TODOs."	2018-04-03 14:10:50 +00:00
Vincent Rabaud	4033e1d70d	Remove some very hard TODOs. Change-Id: I3d1b0072e0ac9125840fbbd76e91d151c82489ec	2018-04-03 15:17:21 +02:00
Vlad Tsyrklevich	978eec2507	[CFI] Remove function pointer casts Control Flow Integrity [1] indirect call checking verifies that function pointers only call valid functions with a matching type signature. This change eliminates function pointer casts that were causing cfi-icall failures. [1] https://www.chromium.org/developers/testing/control-flow-integrity BUG=chromium:827826 Change-Id: I5db021d06390a6cefd670fdd2f0d34c9e530465e	2018-04-02 16:04:47 -07:00
James Zern	c909d53182	Merge "remove some deprecation warning on MacOSX"	2018-03-31 08:09:41 +00:00
Pascal Massimino	217443c71a	remove some deprecation warning on MacOSX Change-Id: I3e6de5ae6d84aa7906049eb55b6373c5123b0158	2018-03-31 09:30:00 +02:00
James Zern	b672bdfaad	configure: quiet glut deprecation warnings on OS X BUG=webp:187 Change-Id: Iad88b5fe417289f00dedcc32e7672fc0898e9ed1	2018-03-31 00:24:53 -07:00
James Zern	daa9fcaf5b	configure: use sdl-config if available + do a full link to ensure SDL_main is resolved if needed fixes detection on OS X BUG=webp:366 Change-Id: Id53329f5d1c2536c4584be61c6379fa76ff0e5de	2018-03-30 23:30:43 -07:00
James Zern	dd174caeff	Merge "imagedec: support metadata reading for WebP image decoding"	2018-03-30 03:18:39 +00:00
Pascal Massimino	641cedccd3	imagedec: support metadata reading for WebP image decoding Needs to link imagedec.a to demux/libwebpdemux.a Change-Id: Id8f4068718b0e4a1e84607bccd5af5419120c231	2018-03-28 16:36:37 -07:00
James Zern	065b2ce10e	anim_diff: add a couple missing newlines in Help() Change-Id: Iad2a4ad2555718f1822b82d38219940ed49385b8	2018-03-26 21:37:29 -07:00
James Zern	c4cc114785	Merge "gif2webp: force low duration frames to 100ms"	2018-03-27 03:11:48 +00:00
James Zern	09333097ed	gif2webp: force low duration frames to 100ms this is consistent with web browser behavior as well as various transcoding tools (ffmpeg, gif2apng, etc). also: update anim_diff to account for this new behaviour. BUG=webp:379 Change-Id: I70cc72a6b401ef32b73cd182a3f12d993d495bf4	2018-03-24 08:26:53 +01:00
Pascal Massimino	e03f0ec319	sharp_yuv: use 14b fixed-point precision for gamma Output is <.1% difference in size, randomly. Speed is 30-50% faster (-m 0 -sharp_yuv). It also gives the exact same output on ARM and x86, because floats are no longer used. Change-Id: Id0f0aa748cc4fc0b82bac1fc5ca954775a0a1b7c	2018-03-23 20:19:28 +01:00
James Zern	b2db361ca6	image_enc,WebPWritePNG: move locals after setjmp this quiets a -Wclobbered warning on const has_alpha under gcc-7 and brings the variables closer to their first use. Change-Id: I8a24f275b7ff34a94d47b576bcf276dbedac2121	2018-03-08 22:58:11 -08:00
Pascal Massimino	74e82ec64a	Merge "WebPPictureDistortion: fix big-endian results order"	2018-02-20 21:20:45 +00:00
James Zern	645d04ca7f	Merge "cwebp,get_disto: report bpp"	2018-02-20 19:56:57 +00:00
James Zern	120f58c3aa	Merge "lossless*sse2: improve non-const 16-bit vector creation"	2018-02-20 19:56:07 +00:00
James Zern	a7fe9412d0	WebPPictureDistortion: fix big-endian results order match the little-endian BGRA order Change-Id: Ie8f1ae3100fac478bae13e53121a6af5b2443374	2018-02-20 11:52:41 -08:00
James Zern	e26fe06680	cwebp,get_disto: report bpp Change-Id: Iefbd834baa4f70eb862071a8e4b87f7d30736aa3	2018-02-19 13:38:29 -08:00
James Zern	9df64e28dd	Merge changes Id5b4a1a4,Ia20ce844 * changes: Import: extract condition from loop Import,RGBA: fix for BigEndian import	2018-02-19 04:28:22 +00:00
James Zern	8043504f95	lossless*sse2: improve non-const 16-bit vector creation use _mm_set1_epi32 instead of _mm_set_epi16 with non-const values; reduces shifts and ors. Change-Id: Ie2cb2ab815f642855d03c6f3001223bcac4bd35c	2018-02-17 17:59:20 -08:00
James Zern	1e3dfc48fb	Import: extract condition from loop do_copy is a loop invariant, but based on a variable parameter; it would only be extracted if Import was inlined. Change-Id: Id5b4a1a4a83a4f2083444da4934e4c994df65b44	2018-02-17 13:30:28 -08:00
Pascal Massimino	3b07d32712	Import,RGBA: fix for BigEndian import + simplification of the logic Change-Id: Ia20ce844793ed35ea03a17cef45838f3d0ae4afa	2018-02-17 13:07:58 -08:00
Vincent Rabaud	551948e45f	Remove unused argument in VP8LBitsEntropy. The function is only used once and does not use the extra argument. Change-Id: I9735383784746cb02b5a643b7a4a2037f2874bf9	2018-02-16 16:05:28 +01:00
Pascal Massimino	3005237a5d	ReadWebP: fix for big-endian Change-Id: I36b3c12ccf02eb5dad350c460387c0528fff8df3	2018-02-14 23:39:26 -08:00
Pascal Massimino	499c395a35	Merge "anim_diff: expose the -max_diff option"	2018-02-09 19:39:03 +00:00
Pascal Massimino	f69dcd692a	Merge "remove WEBP_EXPERIMENTAL_FEATURES"	2018-02-09 18:53:59 +00:00
James Zern	07d884d59b	anim_diff: expose the -max_diff option this removes the last remnant of WEBP_EXPERIMENTAL_FEATURES Change-Id: I5952107b5aae60b865f0745e0bb4a7e1663af5aa	2018-02-09 10:32:09 -08:00
James Zern	f4dd92565e	remove WEBP_EXPERIMENTAL_FEATURES the webp bitstream is considered stable at this point Change-Id: I4b13f9ed4c45f63785474b097e96cb7bf651be7b	2018-02-09 10:25:11 -08:00
skal	94a8377b3e	extract the command-line parsing helpers to example_util + make img2webp tool use the text-file parsing option too. Change-Id: I1976e651bbe8b4701abceba89e054b4fb3c35696	2018-02-08 08:11:31 +00:00
skal	fc09e6e252	PNM decoder: prevent unsupported depth=2 PAM case. Change-Id: I8476818908d71498dd80b07dc255aa008ffd16f5	2018-02-07 18:24:01 -08:00
skal	6de58603b7	MIPS64: Fix defined-but-not-used errors with WEBP_REDUCE_CSP BUG=webp:372 Change-Id: Ided3fae748face18138a8050eaced5e0f58120d4	2018-01-30 17:40:09 -08:00
James Zern	cbde5728c8	gif2webp: add support for reading from stdin output to stdout is already supported; this matches [cd]webp BUG=webp:371 Change-Id: Ib1ce1661b16ea792943bca2980f779584e90cc86	2018-01-26 03:13:44 -08:00
Vincent Rabaud	cf1c5054c7	Add an SSE4 version of some lossless color transforms. Change-Id: Ieac094f684116d1292793b2ca321f6f1a69565b5	2018-01-24 14:33:25 +01:00
Vincent Rabaud	45a8b5eb59	Fix lint error with man page. LC_ALL=en_US.UTF-8 MANROFFSEQ='' MANWIDTH=80 \ man --warnings -E UTF-8 -l -Tutf8 -Z img2webp.1 >/dev/null would trigger it. BUG=webp:370 Change-Id: I9543112bc58ac424af86bb65f7d894707a5646c7	2018-01-23 21:18:30 +01:00
Pascal Massimino	cff38e8f4d	Merge "PNG decoder: handle gAMA chunk"	2018-01-06 00:07:58 +00:00
Pascal Massimino	59cb1a48c1	Merge "enable dc error-diffusion always"	2018-01-05 22:51:58 +00:00
Pascal Massimino	78318b30e5	PNG decoder: handle gAMA chunk Apply gamma correction to the decoded RGB values. This handles corner cases where the PNG file doesn't have a standard 1/2.2 gamma value. BUG=webp:369 Change-Id: I9907b6e2c458002de7c26d0b9e416278cca33990	2018-01-05 10:56:52 -08:00
Pascal Massimino	664c21dd01	Merge "remove some TODOs"	2017-12-28 14:39:35 +00:00
Pascal Massimino	815652de03	enable dc error-diffusion always for q<=98, we always enable error diffusion. + reduce storage 2x by using int8_t + make the error diffusion more robust BUG=webp:340,308 Change-Id: I0608df839ff7b64d6843005a0f81d2577143af9e	2017-12-27 20:11:57 +00:00
Pascal Massimino	aec45cec33	remove some TODOs * regarding alpha_data_ used for testing. alpha_data_!=NULL is as close a good test as we'll get. * regarding filter-strength / sharpness forcing no practical use (can be done during encode cycles, for experimentation) * regarding a 'less-complex' filtering: no practical use so far. Next version! Change-Id: If2dfff5818552a7d3e7c23ac08d64fe6d270229c	2017-12-27 17:40:18 +01:00
Pascal Massimino	5715dfce2e	fix block-count[] increment in case of large image For large images overflowing the partition0, we re-do a number of passes but were forgetting to reset the block_count[]. This was leading to incorrect summary. + some cosmetic fixes here and there BUG=webp:355 Change-Id: Ie87158d7f177f8efdca429b146cfcd0e81652d2f	2017-12-27 17:12:58 +01:00
Pascal Massimino	c2d04f3eb2	enable DC error-diffusion always for multi-pass We can't predict if the quality is going to be below the threshold eventually, so we might as well enable it always. Change-Id: I30aedecc8c6d4daf159f6ef152697df0206d1e93	2017-12-12 15:00:45 +01:00
Pascal Massimino	96bf07c560	use DC error diffusion for U/V at low-quality This fixes some color smearing due to heavy quantization. This is only enabled for q <= 30 (cf ERROR_DIFFUSION_QUALITY) Change-Id: I07e83a4d38461357a32c9e214f7eadc6db73baa9	2017-12-11 06:37:40 -08:00
Pascal Massimino	1c59020b93	fix missing sse41 targets in makefile.unix Change-Id: I8c7a39746594caea160c40e25ea22d756ca44e11	2017-12-11 00:00:19 -08:00
James Zern	7a8e814b57	cosmetics: s/color_space/colorspace/ in webpinfo.c, quality_estimate.c. this form is used elsewhere in the codebase Change-Id: I40c8202db51a7356e6a14d7e9b25c68153548438	2017-12-08 12:40:18 -08:00
James Zern	05f6fe24c3	upsampling: rm asserts w/REDUCE_CSP+OMIT_C_CODE with WEBP_NEON_OMIT_C_CODE the default _C functions won't be set and with WEBP_REDUCE_CSP the NEON functions won't be either triggering an assert for an empty table member. BUG=chromium:792627 Change-Id: I8d2d430eaa37bb92885b61a3dd39f961924a8def	2017-12-06 17:09:26 -08:00
Vincent Rabaud	b4cf5597f4	Merge "Upsampling SSE2/SSE4 speedup."	2017-12-06 10:10:12 +00:00
James Zern	ccbeb32c04	Makefile.vc: add missing sse41 files upsampling_sse41.c and yuv_sse41.c added in: 807b53c4 Implement the upsampling/yuv functions in SSE41 Change-Id: I186cb6f6c296ba26b8e9b42d88da7f58c55710a9	2017-12-05 23:27:16 -08:00
Vincent Rabaud	55403a9a5a	Upsampling SSE2/SSE4 speedup. RGB to YUV conversion was not using SSE to finish up the row. End data is now copied to a buffer big enough to fit in a SSE register. (UPSAMPLE_LAST_BLOCK was already using that trick). Change-Id: Ie539bcbe570a643a774aa88263503c0d2c41890f	2017-12-05 23:37:06 +01:00
Vincent Rabaud	807b53c47e	Implement the upsampling/yuv functions in SSE41 Change-Id: If122da22b74a974262063d232f6ca0ab902ff64e	2017-12-04 22:29:43 +01:00
Vincent Rabaud	84101a8165	Fix wasm WebP compilation Change-Id: I6638628fbf3b7ae310bc892c9ca49678d1098b9b	2017-12-04 13:39:50 +01:00
Pascal Massimino	8bebd2a32e	fix warning on MSVC 'function' : different 'const' qualifiers Change-Id: I855e94e8734a7e9a6156c771a7bad41b19a450d7	2017-12-01 22:46:48 -08:00
Pascal Massimino	a7f93fe32d	webpmux: allow reading argument from a file if a single text file name is supplied as argument (e.g.: 'webpmux my_long_list_of_frames.txt'), the command line arguments are actually parsed from this file. Tokenizer will remove space, tabs, LF, CR, returns, etc. + changed ImgIoUtilReadFile() to return a null-terminated data, for convenience. + misc clean-up in the code BUG=webp:355 Change-Id: I76796305641d660933de5881763d723006712fa9 ---	2017-12-01 01:42:22 -08:00
James Zern	b69f18a73a	gif2webp.1: fix -loop_compatibility layout Change-Id: Ia29b7a9e3b72605d2bb8c13ad3e37b88094444f5	2017-11-29 22:59:30 -08:00
James Zern	72d530c01d	Merge "fix lossless decoding w/WEBP_REDUCE_SIZE"	2017-11-30 06:42:20 +00:00
James Zern	296c7dc4ac	fix lossless decoding w/WEBP_REDUCE_SIZE alpha processing is still required when requesting premultiplied output since: 1b27bf8b WEBP_REDUCE_SIZE: disable all rescaler code Change-Id: Id1b03256c4c04b8db31527e60cd31dd20ce6f3ad	2017-11-29 17:01:40 -08:00
Pascal Massimino	0d5d029c18	Merge "ImgIoUtilReadFile: fix file leak upon error"	2017-11-30 00:24:51 +00:00
Pascal Massimino	ae568ce7c4	ImgIoUtilReadFile: fix file leak upon error the file was not closed in case of malloc error. Change-Id: I5f8b22d7d0da6d2c8c2dd245cdd57994e3ddea3a	2017-11-29 21:30:22 +01:00
James Zern	796b5a8a8a	libwebp-0.6.1 - 11/24/2017: version 0.6.1 This is a binary compatible release. * lossless performance and compression improvements + a new 'cruncher' mode (-m 6 -q 100) * ARM performance improvements with clang (15-20% w/ndk r15c, issue #339) * webp-js: emscripten/webassembly based javascript decoder * miscellaneous bug & build fixes (issue #329, #332, #343, #353, #360, #361, #363) Tool updates / additions: added webpinfo - prints file format information (issue #330) gif2webp - loop behavior modified to match Chrome M63+ (crbug.com/649264); '-loop_compatibility' can be used for the old behavior -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABAgAGBQJaHeXcAAoJEPnD1r24IytdGCMQAKr0uEtzi6MOWGX61y7APIst ZqKwwIppqsY8jFCkcfYcQY4iRDhdIPm4QBxZN1PYCkB9yAEHtlGeoxjdFgckGW90 DAZI2z710KvE0rA5zvqa1zxKMct2lB06AG2cYViD+I6PNzkQ1lI21TnqAC3mJ5qP t0EN32s8h6jAgbji3u+fANSNeT9iHDuqZaAJmoXVHdjQy4pB5geJrKKHN2FAQzL4 +FG8glM1CK0P5ZWsX1UljiaMdSnsiKcOb2BvVACgGx7OaBAhTOgi9GPGsRiF36Wr ddiTsbRN/RH+G4njVMq/es3lFWJCbYKozcBTdfthfrRQbgc7ZhhttNV2Qqr5fNTo qwF1VfuJs/i6c4+Uk7SXcOvwCHemPMXQyKJ7ZjZH8BZBxvvbI+fGO4xkc36F3qN8 cCm5yCFry9O6vIBG243c/y5jNJLEBnIuQgoKO6HPlUzm3qb26s7+o9Ptd4rIs7XD ikoUmcJoIXUt/brcTpJYqmGgT2Gj5Oflg0XgWdt/9tw7bQ55QStQf/YfMQ0fLtfk QnG0NtDs9GavT2Q6NnadiAmzw1o0ZJMRiAX+BHPK6di3awWArKQuRiIGZZrlmy7Q wDsuEHuClKvPof5qsZYHXh10tupYSml0bw0ZNObltCfaiimvr+BeXR4WXND9taA4 xNfJdCIQmoqIObCArbyk =deXw -----END PGP SIGNATURE----- Merge tag 'v0.6.1' libwebp-0.6.1 - 11/24/2017: version 0.6.1 This is a binary compatible release. * lossless performance and compression improvements + a new 'cruncher' mode (-m 6 -q 100) * ARM performance improvements with clang (15-20% w/ndk r15c, issue #339) * webp-js: emscripten/webassembly based javascript decoder * miscellaneous bug & build fixes (issue #329, #332, #343, #353, #360, #361, #363) Tool updates / additions: added webpinfo - prints file format information (issue #330) gif2webp - loop behavior modified to match Chrome M63+ (crbug.com/649264); '-loop_compatibility' can be used for the old behavior * tag 'v0.6.1': update ChangeLog WEBP_REDUCE_CSP: restrict colorspace support update ChangeLog vwebp: disable double buffering on windows & mac webp_to_sdl.c: fix file mode WEBP_REDUCE_SIZE: disable all rescaler code webpinfo: add -version option bump version to 0.6.1 update NEWS README: add webpinfo section Change-Id: Iab2153fae38da3c99daccdf57fec816e07b7909a	2017-11-28 15:03:15 -08:00
Pascal Massimino	1af0df7662	Merge "WEBP_REDUCE_CSP: restrict colorspace support"	2017-11-27 20:08:55 +00:00
Pascal Massimino	6de20df02c	WEBP_REDUCE_CSP: restrict colorspace support only supported ones are: RGBA/BGRA/rgbA/bgrA (decoder) as well as: WebPPictureImportRGB/RGBX/RGBA (encoder). (note: extras/get_disto is affected too) Change-Id: If6c4f95054ca15759c4e289fb3b4c352b3521c2c	2017-11-26 08:44:08 +00:00
Pascal Massimino	0df22b9eed	WEBP_REDUCE_SIZE: disable all rescaler code BUG=webp:355 Change-Id: Id87cb11902e3fb8544a214308526ea9665ce8440	2017-11-24 22:08:32 +00:00