update ChangeLog

Change-Id: I839ad7871f0bbe7a8a3a900b3176d2754eef0f6e
vwebp: fix incorrect clipping w/NO_BLEND
2015-10-23 13:20:40 -07:00 · 2015-10-23 13:10:17 -07:00 · 2015-10-20 22:44:52 -07:00 · 2015-10-19 15:41:24 -07:00 · 2015-10-19 15:41:24 -07:00 · 2015-10-19 15:41:23 -07:00
83 changed files with 3439 additions and 1470 deletions
--- a/2
+++ b/2
@ -16,9 +16,11 @@ Contributors:
 - Pascal Massimino (pascal dot massimino at gmail dot com)
 - Paweł Hajdan, Jr (phajdan dot jr at chromium dot org)
 - Pierre Joye (pierre dot php at gmail dot com)
+- Sam Clegg (sbc at chromium dot org)
 - Scott LaVarnway (slavarnway at google dot com)
 - Scott Talbot (s at chikachow dot org)
 - Slobodan Prijic (slobodan dot prijic at imgtec dot com)
 - Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Timothy Gu (timothygu99 at gmail dot com)
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
--- a/Android.mk
+++ b/Android.mk
@ -10,8 +10,6 @@ ifeq ($(APP_OPTIM),release)
  endif
 endif

-include $(CLEAR_VARS)
-
 ifneq ($(findstring armeabi-v7a, $(TARGET_ARCH_ABI)),)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
  # instructions to be generated for armv7a code. Instead target the neon code
@ -21,7 +19,7 @@ else
  NEON := c
 endif

-LOCAL_SRC_FILES := \
+dec_srcs := \
    src/dec/alpha.c \
    src/dec/buffer.c \
    src/dec/frame.c \
@ -32,18 +30,19 @@ LOCAL_SRC_FILES := \
    src/dec/vp8.c \
    src/dec/vp8l.c \
    src/dec/webp.c \
+
+demux_srcs := \
+    src/demux/demux.c \
+
+dsp_dec_srcs := \
    src/dsp/alpha_processing.c \
+    src/dsp/alpha_processing_sse2.c \
    src/dsp/cpu.c \
    src/dsp/dec.c \
    src/dsp/dec_clip_tables.c \
    src/dsp/dec_mips32.c \
    src/dsp/dec_neon.$(NEON) \
    src/dsp/dec_sse2.c \
-    src/dsp/enc.c \
-    src/dsp/enc_avx2.c \
-    src/dsp/enc_mips32.c \
-    src/dsp/enc_neon.$(NEON) \
-    src/dsp/enc_sse2.c \
    src/dsp/lossless.c \
    src/dsp/lossless_mips32.c \
    src/dsp/lossless_neon.$(NEON) \
@ -54,6 +53,15 @@ LOCAL_SRC_FILES := \
    src/dsp/yuv.c \
    src/dsp/yuv_mips32.c \
    src/dsp/yuv_sse2.c \
+
+dsp_enc_srcs := \
+    src/dsp/enc.c \
+    src/dsp/enc_avx2.c \
+    src/dsp/enc_mips32.c \
+    src/dsp/enc_neon.$(NEON) \
+    src/dsp/enc_sse2.c \
+
+enc_srcs := \
    src/enc/alpha.c \
    src/enc/analysis.c \
    src/enc/backward_references.c \
@ -74,19 +82,38 @@ LOCAL_SRC_FILES := \
    src/enc/tree.c \
    src/enc/vp8l.c \
    src/enc/webpenc.c \
+
+mux_srcs := \
+    src/mux/muxedit.c \
+    src/mux/muxinternal.c \
+    src/mux/muxread.c \
+
+utils_dec_srcs := \
    src/utils/bit_reader.c \
-    src/utils/bit_writer.c \
    src/utils/color_cache.c \
    src/utils/filters.c \
    src/utils/huffman.c \
-    src/utils/huffman_encode.c \
-    src/utils/quant_levels.c \
    src/utils/quant_levels_dec.c \
    src/utils/random.c \
    src/utils/rescaler.c \
    src/utils/thread.c \
    src/utils/utils.c \

+utils_enc_srcs := \
+    src/utils/bit_writer.c \
+    src/utils/huffman_encode.c \
+    src/utils/quant_levels.c \
+
+################################################################################
+# libwebpdecoder
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    $(dec_srcs) \
+    $(dsp_dec_srcs) \
+    $(utils_dec_srcs) \
+
 LOCAL_CFLAGS := $(WEBP_CFLAGS)
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

@ -95,6 +122,38 @@ LOCAL_ARM_MODE := arm

 LOCAL_STATIC_LIBRARIES := cpufeatures

+LOCAL_MODULE := webpdecoder_static
+
+include $(BUILD_STATIC_LIBRARY)
+
+ifeq ($(ENABLE_SHARED),1)
+include $(CLEAR_VARS)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
+
+LOCAL_MODULE := webpdecoder
+
+include $(BUILD_SHARED_LIBRARY)
+endif  # ENABLE_SHARED=1
+
+################################################################################
+# libwebp
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    $(dsp_enc_srcs) \
+    $(enc_srcs) \
+    $(utils_enc_srcs) \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_WHOLE_STATIC_LIBRARIES := webpdecoder_static
+
 LOCAL_MODULE := webp

 ifeq ($(ENABLE_SHARED),1)
@ -103,6 +162,54 @@ else
  include $(BUILD_STATIC_LIBRARY)
 endif

+################################################################################
+# libwebpdemux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(demux_srcs)
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE := webpdemux
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := webp
+  include $(BUILD_SHARED_LIBRARY)
+else
+  LOCAL_STATIC_LIBRARIES := webp
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+################################################################################
+# libwebpmux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(mux_srcs)
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+
+# prefer arm over thumb mode for performance gains
+LOCAL_ARM_MODE := arm
+
+LOCAL_MODULE := webpmux
+
+ifeq ($(ENABLE_SHARED),1)
+  LOCAL_SHARED_LIBRARIES := webp
+  include $(BUILD_SHARED_LIBRARY)
+else
+  LOCAL_STATIC_LIBRARIES := webp
+  include $(BUILD_STATIC_LIBRARY)
+endif
+
+################################################################################
+
 include $(LOCAL_PATH)/examples/Android.mk

 $(call import-module,android/cpufeatures)
--- a/122
+++ b/122
@ -1,3 +1,125 @@
+46e18c0 vwebp: fix incorrect clipping w/NO_BLEND
+fcfde90 update issue tracker url
+8c3fb33 update AUTHORS
+808d4a6 update NEWS
+6286404 bump version to 0.4.4
+b8b314a doc/webp-container-spec: update repo browser link
+c3953e3 fix typo: constitutes -> constitute
+cd377e2 Use __has_builtin to check clang support
+e2e8980 wicdec: fix alpha detection w/64bpp BGRA/RGBA
+5c3fe77 iosbuild: fix linking with Xcode 7 / iOS SDK 9
+f9f5498 VP8LAllocateHistogramSet: align histogram[] entries
+3026db2 Loosen the buffer size checks for Y/U/V/A too.
+d089362 loosen the padding check on buffer size
+53d22c5 dec_neon: add whitespace around stringizing operator
+8bcc4d4 dsp/mips: add whitespace around stringizing operator
+d49c44f Container spec: clarify ordering of ALPH chunk.
+382de22 msvc: fix pointer type warning in BitsLog2Floor
+84ecd9d FlattenSimilarBlocks should only be tried when blending is possible.
+f55ebbb backport rescaler fix
+2ff633c fix mips2 build target
+326b5fb update ChangeLog (tag: v0.4.3, origin/0.4.3, 0.4.3)
+a661e50 Disable NEON code on Native Client
+fcd94e9 update ChangeLog (tag: v0.4.3-rc1)
+569fe57 update NEWS
+bd852f5 bump version to 0.4.3
+2d58b64 WebPPictureRescale: add a note about 0 width/height
+a0d8ca5 examples/Android.mk: add webpmux_example target
+34b1d29 Android.mk: add webpmux target
+7561988 Android.mk: add webpdemux target
+a987576 Android.mk: add webpdecoder{,_static} targets
+a6d4859 Android.mk: split source lists per-directory
+77544d5 fix iOS arm64 build with Xcode 6.3
+6dea157 doc/webp-container-spec: note MSB order for chunk diagrams
+f7cd57b doc/webp-container-spec: cosmetics
+1d6b250 vwebp: clear canvas at the beginning of each loop
+f97b3f8 webp-container-spec: clarify background clear on loop
+4ba83c1 vwebp: remove unnecessary static Help() prototype
+d34e8e3 vwebp/animation: display last frame on end-of-loop
+bbbc524 dec/vp8: clear 'dither_' on skipped blocks
+0339fa2 lossless_neon: enable subtract green for aarch64
+5a0c220 Regression fix for lossless decoding
+6e3a31d wicdec: (msvs) quiet some /analyze warnings
+b49a578 dwebp/WritePNG: mark png variables volatile
+0a4391a dwebp: include setjmp.h w/WEBP_HAVE_PNG
+90f1ec5 dwebp: correct sign in format strings
+b61ce86 VP8LEncodeStream: add an assert
+df1081b dsp/cpu: (msvs) add include for __cpuidex
+39aa055 dsp/cpu: (msvs) avoid immintrin.h on _M_ARM
+f814f42 dsp/cpu: add include for _xgetbv() w/MSVS
+8508ab9 cpu: fix AVX2 detection for gcc/clang targets
+5769623 fix handling of zero-sized partition #0 corner case
+b2e71a9 make the 'last_cpuinfo_used' variable names unique
+1273e84 add -Wformat-nonliteral and -Wformat-security
+3ae78eb multi-thread fix: lock each entry points with a static var
+5c1eeda webp-container-spec: remove references to fragments
+c5ceea4 enc_neon: fix building with non-Xcode clang (iOS)
+d0859d6 iosbuild: add x64_64 simulator support
+046732c WebPEncode: Support encoding same pic twice (even if modified)
+4426f50 webp/types.h: use inline for clang++/-std=c++11
+e297fc7 gif2webp: Use the default hint instead of WEBP_HINT_GRAPH.
+855fe43 Makefile.vc: add a 'legacy' RTLIBCFG option
+b7eb6d5 gif2webp: Support GIF_DISPOSE_RESTORE_PREVIOUS
+5691bdd gif2webp: Handle frames with odd offsets + disposal to background.
+8301da1 stopwatch.h: fix includes
+6a2209a update ChangeLog (tag: v0.4.2, origin/0.4.2, 0.4.2)
+36cad6a bit_reader.h: cosmetics: fix a typo
+e2ecae6 enc_mips32: workaround gcc-4.9 bug
+243e68d update ChangeLog (tag: v0.4.2-rc2)
+eec5f5f enc/vp8enci.h: update version number
+0c1b98d update NEWS
+69b0fc9 update AUTHORS
+857578a bump version to 0.4.2
+9129deb restore encode API compatibility
+f17b95e AssignSegments: quiet -Warray-bounds warning
+9c56c8a enc_neon: initialize vectors w/vdup_n_u32
+a008902 iosbuild: cleanup
+cc6de53 iosbuild: output autoconf req. on failure
+740d765 iosbuild: make iOS 6 the minimum requirement
+403023f iobuild.sh: only install .h files in Headers
+b65727b Premultiply with alpha during U/V downsampling
+8de0deb gif2webp: Background color correction
+f8b7d94 Amend the lossless spec according to issue #205, #206 and #224
+9102a7b Add a WebPExtractAlpha function to dsp
+e407b5d webpmux: simplify InitializeConfig()
+3e70e64 webpmux: fix indent
+be38f1a webpmux: fix exit status on numeric value parse error
+94dadcb webpmux: fix loop_count range check
+40b3a61 examples: warn on invalid numeric parameters
+b7d209a gif2webp: Handle frames with missing  graphic control extension
+bf0eb74 configure: simplify libpng-config invocation
+3740f7d Rectify bug in lossless incremental decoding.
+3ab0a37 make VP8LSetBitPos() set br->eos_ flag
+2e4312b Lossless decoding: fix eos_ flag condition
+e6609ac fix erroneous dec->status_ setting
+5692eae add a fallback to ALPHA_NO_COMPRESSION
+6ecd5bf ExUtilReadFromStdin: (windows) open stdin in bin mode
+4206ac6 webpmux: (windows) open stdout in binary mode
+d40e885 cwebp: (windows) open stdout in binary mode
+4aaf463 example_util: add ExUtilSetBinaryMode
+4c82ff7 webpmux man page: Clarify some title, descriptions and examples
+23d4fb3 dsp/lossless: workaround gcc-4.9 bug on arm
+5af7719 dsp.h: collect gcc/clang version test macros
+90d1124 enc_neon: enable QuantizeBlock for aarch64
+ee78e78 SmartRGBYUV: fix odd-width problem with pixel replication
+c9ac204 fix some MSVC64 warning about float conversion
+f4497a1 cpu: check for _MSC_VER before using msvc inline asm
+e2159fd faster RGB->YUV conversion function (~7% speedup)
+21abaa0 Add smart RGB->YUV conversion option -pre 4
+1a161e2 configure: add work around for gcc-4.9 aarch64 bug
+55b10de MIPS: mips32r2: added optimization for BSwap32
+76d2192 Update PATENTS to reflect s/VP8/WebM/g
+29a9db1 MIPS: detect mips32r6 and disable mips32r1 code
+245c4a6 Correctly use the AC_CANONICAL_* macros
+40aa8b6 cosmetics
+2ddcca5 cosmetics: remove some extraneous 'extern's
+f40dd7c vp8enci.h: cosmetics: fix '*' placement
+4610c9c bit_writer: cosmetics: rename kFlush() -> Flush()
+fc3c175 dsp: detect mips64 & disable mips32 code
+c1a7955 cwebp.1: restore quality description
+57a7e73 correct alpha_dithering_strength ABI check
+6c83157 correct WebPMemoryWriterClear ABI check
+8af2771 update ChangeLog (tag: v0.4.1, origin/0.4.1, 0.4.1)
 f59c0b4 iosbuild.sh: specify optimization flags
 8d34ea3 update ChangeLog (tag: v0.4.1-rc1)
 dbc3da6 makefile.unix: add vwebp.1 to the dist target
--- a/Makefile.vc
+++ b/Makefile.vc
@ -27,7 +27,7 @@ PLATFORM_LDFLAGS = /SAFESEH
 NOLOGO     = /nologo
 CCNODBG    = cl.exe $(NOLOGO) /O2 /DNDEBUG
 CCDEBUG    = cl.exe $(NOLOGO) /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c /GS
+CFLAGS     = /Isrc $(NOLOGO) /W3 /EHsc /c
 CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
 CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
 LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE
@ -54,6 +54,11 @@ AVX2_FLAGS = /arch:AVX2
 !IF "$(RTLIBCFG)" == "static"
 RTLIB  = /MT
 RTLIBD = /MTd
+!ELSE IF "$(RTLIBCFG)" == "legacy"
+RTLIBCFG = static
+RTLIB  = /MT
+RTLIBD = /MTd
+CFLAGS = $(CFLAGS) /GS- /arch:IA32
 !ELSE
 RTLIB   = /MD
 RTLIBD  = /MDd
@ -139,6 +144,7 @@ CFGSET = TRUE
 !MESSAGE - all                            - build (de)mux-based targets for CFG
 !MESSAGE
 !MESSAGE RTLIBCFG controls the runtime library linkage - 'static' or 'dynamic'.
+!MESSAGE   'legacy' will produce a Windows 2000 compatible library.
 !MESSAGE OBJDIR is the path where you like to build (obj, bins, etc.),
 !MESSAGE   defaults to ..\obj

@ -172,6 +178,7 @@ DEMUX_OBJS = \

 DSP_DEC_OBJS = \
    $(DIROBJ)\dsp\alpha_processing.obj \
+    $(DIROBJ)\dsp\alpha_processing_sse2.obj \
    $(DIROBJ)\dsp\cpu.obj \
    $(DIROBJ)\dsp\dec.obj \
    $(DIROBJ)\dsp\dec_clip_tables.obj \
--- a/22
+++ b/22
@ -1,3 +1,25 @@
+- 10/15/15: version 0.4.4
+  This is a binary compatible release.
+  * rescaling out-of-bounds read fix (issue #254)
+  * various build fixes and improvements (issues #253, #259, #262, #267, #268)
+  * container documentation update
+  * gif2webp transparency fix (issue #245)
+
+- 3/3/15: version 0.4.3
+  This is a binary compatible release.
+  * Android / gcc / iOS / MSVS build fixes and improvements
+  * lossless decode fix (issue #239 -- since 0.4.0)
+  * documentation / vwebp updates for animation
+  * multi-threading fix (issue #234)
+
+- 10/13/14: version 0.4.2
+  This is a binary compatible release.
+  * Android / gcc build fixes
+  * (Windows) fix reading from stdin and writing to stdout
+  * gif2webp: miscellaneous fixes
+  * fix 'alpha-leak' with lossy compression (issue #220)
+  * the lossless bitstream spec has been amended to reflect the current code
+
 - 7/24/14: version 0.4.1
  This is a binary compatible release.
  * AArch64 (arm64) & MIPS support/optimizations
--- a/39
+++ b/39
@ -1,22 +1,23 @@
 Additional IP Rights Grant (Patents)
+------------------------------------

-"This implementation" means the copyrightable works distributed by
-Google as part of the WebM Project.
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.

-Google hereby grants to you a perpetual, worldwide, non-exclusive,
-no-charge, royalty-free, irrevocable (except as stated in this section)
-patent license to make, have made, use, offer to sell, sell, import,
-transfer, and otherwise run, modify and propagate the contents of this
-implementation of VP8, where such license applies only to those patent
-claims, both currently owned by Google and acquired in the future,
-licensable by Google that are necessarily infringed by this
-implementation of VP8. This grant does not include claims that would be
-infringed only as a consequence of further modification of this
-implementation. If you or your agent or exclusive licensee institute or
-order or agree to the institution of patent litigation against any
-entity (including a cross-claim or counterclaim in a lawsuit) alleging
-that this implementation of VP8 or any code incorporated within this
-implementation of VP8 constitutes direct or contributory patent
-infringement, or inducement of patent infringement, then any patent
-rights granted to you under this License for this implementation of VP8
-shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
--- a/4
+++ b/4
@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.4.1
+               \____/____/\_____/_____/____/v0.4.4

 Description:
 ============
@ -596,7 +596,7 @@ Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/README.mux
+++ b/README.mux
@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.1
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.2


 Description:
@ -175,7 +175,7 @@ Bugs:
 =====

 Please report all bugs to our issue tracker:
-    http://code.google.com/p/webp/issues
+    https://bugs.chromium.org/p/webp
 Patches welcome! See this page to get started:
    http://www.webmproject.org/code/contribute/submitting-patches/

--- a/configure.ac
+++ b/configure.ac
@ -1,7 +1,7 @@
-AC_INIT([libwebp], [0.4.1],
-        [http://code.google.com/p/webp/issues],,
+AC_INIT([libwebp], [0.4.4],
+        [https://bugs.chromium.org/p/webp],,
        [http://developers.google.com/speed/webp])
-AC_CANONICAL_TARGET
+AC_CANONICAL_HOST
 AC_PREREQ([2.60])
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])

@ -54,6 +54,7 @@ AC_DEFUN([TEST_AND_ADD_CFLAGS],
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wall])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wdeclaration-after-statement])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wextra])
+TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-nonliteral])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wformat-security])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-declarations])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wmissing-prototypes])
@ -62,6 +63,19 @@ TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wshadow])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused-but-set-variable])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wunused])
 TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-Wvla])
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62040
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61622
+AS_IF([test "$GCC" = "yes" ], [
+       gcc_version=`$CC -dumpversion`
+       gcc_wht_bug=""
+       case "$host_cpu" in
+         aarch64|arm64)
+          case "$gcc_version" in
+            4.9|4.9.0|4.9.1) gcc_wht_bug=yes ;;
+          esac
+       esac
+       AS_IF([test "$gcc_wht_bug" = "yes"], [
+              TEST_AND_ADD_CFLAGS([AM_CFLAGS], [-frename-registers])])])
 AC_SUBST([AM_CFLAGS])

 dnl === Check for machine specific flags
@ -283,15 +297,12 @@ AC_ARG_ENABLE([png], AS_HELP_STRING([--disable-png],
                                     @<:@default=auto@:>@]))
 AS_IF([test "x$enable_png" != "xno"], [
  CLEAR_LIBVARS([PNG])
-  AC_PATH_PROGS(LIBPNG_CONFIG,
-                [libpng-config libpng15-config libpng14-config libpng12-config])
+  AC_PATH_PROGS([LIBPNG_CONFIG],
+                [libpng-config libpng16-config libpng15-config libpng14-config \
+                 libpng12-config])
  if test -n "$LIBPNG_CONFIG"; then
    PNG_INCLUDES=`$LIBPNG_CONFIG --cflags`
-    PNG_PREFIX=`$LIBPNG_CONFIG --prefix`
-    if test "${PNG_PREFIX}/lib" != "/usr/lib" ; then
-      PNG_LIBS="-L${PNG_PREFIX}/lib"
-    fi
-    PNG_LIBS="$PNG_LIBS `$LIBPNG_CONFIG --libs`"
+    PNG_LIBS="`$LIBPNG_CONFIG --ldflags`"
  fi

  WITHLIB_OPTION([png], [PNG])
@ -409,8 +420,9 @@ AC_ARG_ENABLE([wic],
                              @<:@default=auto@:>@]),,
              [enable_wic=yes])

-if test \( "$target_os" = "mingw32" -o "$target_os" = "mingw64" \) \
-        -a "$enable_wic" = "yes"; then
+case $host_os in
+mingw*)
+if test "$enable_wic" = "yes"; then
  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
  if test "$ac_cv_header_wincodec_h" = "yes"; then
    AC_MSG_CHECKING(for Windows Imaging Component support)
@ -450,6 +462,7 @@ if test \( "$target_os" = "mingw32" -o "$target_os" = "mingw64" \) \
    AC_MSG_RESULT(${wic_support-no})
  fi
 fi
+esac

 dnl === If --enable-aligned is defined, define WEBP_FORCE_ALIGNED

--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@ -46,25 +46,16 @@ for:
  * **Animation.** An image may have multiple frames with pauses between them,
    making it an animation.

-  * **Image Fragmentation.** A single bitstream in WebP has an inherent
-    limitation for width or height of 2^14 pixels, and, when using VP8, a 512
-    KiB limit on the size of the first compressed partition. To support larger
-    images, the format supports images that are composed of multiple fragments,
-    each encoded as a separate bitstream. All fragments logically form a single
-    image: they have common metadata, color profile, etc. Image fragmentation
-    may also improve efficiency for larger images, e.g., grass can be encoded
-    differently than sky.
-
 The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
 "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in [RFC 2119][].

+Bit numbering in chunk diagrams starts at `0` for the most significant bit
+('MSB 0') as described in [RFC 1166][].
+
 **Note:** Out of the features mentioned above, lossy compression, lossless
 compression, transparency, metadata, color profile and animation are finalized
-and are to be considered stable. On the other hand, image fragmentation is
-experimental as of now, and is open to discussion, feedback and comments.
-The same is indicated using annotation "_status: experimental_" in the relevant
-sections of this document.
+and are to be considered stable.

 Terminology &amp; Basics
 ------------------------
@ -79,7 +70,7 @@ Below are additional terms used throughout this document:
 _Reader/Writer_

 : Code that reads WebP files is referred to as a _reader_, while code that
-writes them is referred to as a _writer_.
+  writes them is referred to as a _writer_.

 _uint16_

@ -101,10 +92,12 @@ _FourCC_
 _1-based_

 : An unsigned integer field storing values offset by `-1`, e.g., such a field
-would store value _25_ as _24_.
+  would store value _25_ as _24_.

-RIFF file format
+
+RIFF File Format
 ----------------
+
 The WebP file format is based on the RIFF (resource interchange file format)
 document format.

@ -144,7 +137,8 @@ _ChunkHeader('ABCD')_
 chunks that apply to any RIFF file format, while FourCCs specific to a file
 format are all lowercase. WebP does not follow this convention.

-WebP file header
+
+WebP File Header
 ----------------

     0                   1                   2                   3
@ -164,8 +158,8 @@ WebP file header
 File Size: 32 bits (_uint32_)

 : The size of the file in bytes starting at offset 8. The maximum value of
-this field is 2^32 minus 10 bytes and thus the size of the whole file is at
-most 4GiB minus 2 bytes.
+  this field is 2^32 minus 10 bytes and thus the size of the whole file is at
+  most 4GiB minus 2 bytes.

 'WEBP': 32 bits

@ -177,7 +171,8 @@ the 'WEBP' FourCC. The file SHOULD NOT contain anything after it. As the size
 of any chunk is even, the size given by the RIFF header is also even. The
 contents of individual chunks will be described in the following sections.

-Simple file format (lossy)
+
+Simple File Format (Lossy)
 --------------------------

 This layout SHOULD be used if the image requires _lossy_ encoding and does not
@ -215,7 +210,8 @@ width and height. That is assumed to be the width and height of the canvas.
 The VP8 specification describes how to decode the image into Y'CbCr
 format. To convert to RGB, Rec. 601 SHOULD be used.

-Simple file format (lossless)
+
+Simple File Format (Lossless)
 -----------------------------

 **Note:** Older readers may not support files using the lossless format.
@ -253,7 +249,8 @@ The current specification of the VP8L bitstream can be found at
 contains the VP8L image width and height. That is assumed to be the width
 and height of the canvas.

-Extended file format
+
+Extended File Format
 --------------------

 **Note:** Older readers may not support files using the extended format.
@ -274,13 +271,15 @@ An extended format file consists of:

  * An optional list of [unknown chunks](#unknown-chunks). _\[status: experimental\]_

-For a _still image_, the _image data_ consists of a single frame, whereas for
-an _animated image_, it consists of multiple frames. More details about frames
-can be found in the [Animation](#animation) section.
+For a _still image_, the _image data_ consists of a single frame, which is made
+up of:

-Moreover, each frame can be fragmented or non-fragmented, as will be described
-in the [Extended WebP file header](#extended_header) section. More details about
-fragments can be found in the [Fragments](#fragments) section.
+  * An optional [alpha subchunk](#alpha).
+
+  * A [bitstream subchunk](#bitstream-vp8vp8l).
+
+For an _animated image_, the _image data_ consists of multiple frames. More
+details about frames can be found in the [Animation](#animation) section.

 All chunks SHOULD be placed in the same order as listed above. If a chunk
 appears in the wrong place, the file is invalid, but readers MAY parse the
@ -302,7 +301,7 @@ Extended WebP file header:
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                      ChunkHeader('VP8X')                      |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |Rsv|I|L|E|X|A|F|                   Reserved                    |
+    |Rsv|I|L|E|X|A|R|                   Reserved                    |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |          Canvas Width Minus One               |             ...
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@ -320,7 +319,7 @@ ICC profile (I): 1 bit
 Alpha (L): 1 bit

 : Set if any of the frames of the image contain transparency information
-("alpha").
+  ("alpha").

 EXIF metadata (E): 1 bit

@ -333,11 +332,11 @@ XMP metadata (X): 1 bit
 Animation (A): 1 bit

 : Set if this is an animated image. Data in 'ANIM' and 'ANMF' chunks should be
-used to control the animation.
+  used to control the animation.

-Image Fragmentation (F): 1 bit _\[status: experimental\]_
+Reserved (R): 1 bit

-: Set if any of the frames in the image are represented by fragments.
+: SHOULD be `0`.

 Reserved: 24 bits

@ -382,9 +381,9 @@ animation.
 Background Color: 32 bits (_uint32_)

 : The default background color of the canvas in \[Blue, Green, Red, Alpha\]
-byte order. This color MAY be used to fill the unused space on the canvas around
-the frames, as well as the transparent pixels of the first frame. Background
-color is also used when disposal method is `1`.
+  byte order. This color MAY be used to fill the unused space on the canvas
+  around the frames, as well as the transparent pixels of the first frame.
+  Background color is also used when disposal method is `1`.

 **Note**:

@ -394,6 +393,9 @@ color is also used when disposal method is `1`.
  * Viewer applications SHOULD treat the background color value as a hint, and
    are not required to use it.

+  * The canvas is cleared at the start of each loop. The background color MAY be
+    used to achieve this.
+
 Loop Count: 16 bits (_uint16_)

 : The number of times to loop the animation. `0` means infinitely.
@ -402,7 +404,6 @@ This chunk MUST appear if the _Animation_ flag in the VP8X chunk is set.
 If the _Animation_ flag is not set and this chunk is present, it
 SHOULD be ignored.

-
 ANMF chunk:

 For animated images, this chunk contains information about a _single_ frame.
@ -445,8 +446,8 @@ Frame Height Minus One: 24 bits (_uint24_)
 Frame Duration: 24 bits (_uint24_)

 : The time to wait before displaying the next frame, in 1 millisecond units.
-In particular, frame duration of 0 is useful when one wants to update multiple
-areas of the canvas at once during the animation.
+  In particular, frame duration of 0 is useful when one wants to update
+  multiple areas of the canvas at once during the animation.

 Reserved: 6 bits

@ -454,28 +455,28 @@ Reserved: 6 bits

 Blending method (B): 1 bit

-: Indicates how transparent pixels of _the current frame_ are to be blended with
-corresponding pixels of the previous canvas:
+: Indicates how transparent pixels of _the current frame_ are to be blended
+  with corresponding pixels of the previous canvas:

-  * `0`: Use alpha blending. After disposing of the previous frame, render the
-    current frame on the canvas using [alpha-blending](#alpha-blending). If the
-    current frame does not have an alpha channel, assume alpha value of 255,
-    effectively replacing the rectangle.
+    * `0`: Use alpha blending. After disposing of the previous frame, render the
+      current frame on the canvas using [alpha-blending](#alpha-blending). If
+      the current frame does not have an alpha channel, assume alpha value of
+      255, effectively replacing the rectangle.

-  * `1`: Do not blend. After disposing of the previous frame, render the
-    current frame on the canvas by overwriting the rectangle covered by the
-    current frame.
+    * `1`: Do not blend. After disposing of the previous frame, render the
+      current frame on the canvas by overwriting the rectangle covered by the
+      current frame.

 Disposal method (D): 1 bit

-: Indicates how _the current frame_ is to be treated after it has been displayed
-(before rendering the next frame) on the canvas:
+: Indicates how _the current frame_ is to be treated after it has been
+  displayed (before rendering the next frame) on the canvas:

-  * `0`: Do not dispose. Leave the canvas as is.
+    * `0`: Do not dispose. Leave the canvas as is.

-  * `1`: Dispose to background color. Fill the _rectangle_ on the canvas covered
-    by the _current frame_ with background color specified in the
-    [ANIM chunk](#anim_chunk).
+    * `1`: Dispose to background color. Fill the _rectangle_ on the canvas
+      covered by the _current frame_ with background color specified in the
+      [ANIM chunk](#anim_chunk).

 **Notes**:

@ -506,9 +507,7 @@ Disposal method (D): 1 bit

 Frame Data: _Chunk Size_ - `16` bytes

-: For a fragmented frame, it consists of multiple [fragment chunks](#fragments).
-
-: For a non-fragmented frame, it consists of:
+: Consists of:

  * An optional [alpha subchunk](#alpha) for the frame.

@ -519,49 +518,6 @@ Frame Data: _Chunk Size_ - `16` bytes
 **Note**: The 'ANMF' payload, _Frame Data_ above, consists of individual
 _padded_ chunks as described by the [RIFF file format](#riff-file-format).

-#### Fragments _\[status: experimental\]_
-
-For images that are represented by fragments, this chunk contains data for
-a single fragment. If the _Image Fragmentation Flag_ is not set, then this chunk
-SHOULD NOT be present.
-
-     0                   1                   2                   3
-     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                      ChunkHeader('FRGM')                      |
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                  Fragment X                   |             ...
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    ...       Fragment Y            |         Fragment Data         |
-    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-
-Fragment X: 24 bits (_uint24_)
-
-: The X coordinate of the upper left corner of the fragment is `Fragment X * 2`
-
-Fragment Y: 24 bits (_uint24_)
-
-: The Y coordinate of the upper left corner of the fragment is `Fragment Y * 2`
-
-Fragment Data: _Chunk Size_ - `6` bytes
-
-: It contains:
-
-  * An optional [alpha subchunk](#alpha) for the fragment.
-  * The [bitstream subchunk](#bitstream-vp8vp8l) for the fragment.
-  * An optional list of [unknown chunks](#unknown-chunks).
-
-Note: The width and height of the fragment is obtained from the bitstream
-subchunk.
-
-The fragments of a frame SHOULD have the following properties:
-
-  * They collectively cover the whole frame.
-
-  * No pair of fragments have any overlapping region on the frame.
-
-  * No portion of any fragment should be located outside of the canvas.
-
 #### Alpha

     0                   1                   2                   3
@ -579,20 +535,20 @@ Reserved (Rsv): 2 bits
 Pre-processing (P): 2 bits

 : These INFORMATIVE bits are used to signal the pre-processing that has
-been performed during compression. The decoder can use this information to
-e.g. dither the values or smooth the gradients prior to display.
+  been performed during compression. The decoder can use this information to
+  e.g. dither the values or smooth the gradients prior to display.

-  * `0`: no pre-processing
-  * `1`: level reduction
+    * `0`: no pre-processing
+    * `1`: level reduction

 Filtering method (F): 2 bits

 : The filtering method used:

-  * `0`: None.
-  * `1`: Horizontal filter.
-  * `2`: Vertical filter.
-  * `3`: Gradient filter.
+    * `0`: None.
+    * `1`: Horizontal filter.
+    * `2`: Vertical filter.
+    * `3`: Gradient filter.

 For each pixel, filtering is performed using the following calculations.
 Assume the alpha values surrounding the current `X` position are labeled as:
@ -636,15 +592,15 @@ Compression method (C): 2 bits

 : The compression method used:

-  * `0`: No compression.
-  * `1`: Compressed using the WebP lossless format.
+    * `0`: No compression.
+    * `1`: Compressed using the WebP lossless format.

 Alpha bitstream: _Chunk Size_ - `1` bytes

 : Encoded alpha bitstream.

-This optional chunk contains encoded alpha data for this frame/fragment. A
-frame/fragment containing a 'VP8L' chunk SHOULD NOT contain this chunk.
+This optional chunk contains encoded alpha data for this frame. A frame
+containing a 'VP8L' chunk SHOULD NOT contain this chunk.

 **Rationale**: The transparency information is already part of the 'VP8L'
 chunk.
@ -675,15 +631,15 @@ compression method is '0') or compressed using the lossless format

 #### Bitstream (VP8/VP8L)

-This chunk contains compressed bitstream data for a single frame/fragment.
+This chunk contains compressed bitstream data for a single frame.

 A bitstream chunk may be either (i) a VP8 chunk, using "VP8 " (note the
 significant fourth-character space) as its tag _or_ (ii) a VP8L chunk, using
 "VP8L" as its tag.

 The formats of VP8 and VP8L chunks are as described in sections
-[Simple file format (lossy)](#simple-file-format-lossy)
-and [Simple file format (lossless)](#simple-file-format-lossless) respectively.
+[Simple File Format (Lossy)](#simple-file-format-lossy)
+and [Simple File Format (Lossless)](#simple-file-format-lossless) respectively.

 #### Color profile

@ -731,7 +687,6 @@ EXIF Metadata: _Chunk Size_ bytes

 : image metadata in EXIF format.

-
 XMP chunk:

     0                   1                   2                   3
@ -762,47 +717,17 @@ A file MAY contain unknown chunks:

  * At the end of the file as described in [Extended WebP file
    header](#extended_header) section.
-  * At the end of FRGM and ANMF chunks as described in [Fragments](#fragments)
-    and [Animation](#animation) sections.
+  * At the end of ANMF chunks as described in the
+    [Animation](#animation) section.

 Readers SHOULD ignore these chunks. Writers SHOULD preserve them in their
 original order (unless they specifically intend to modify these chunks).

-### Assembling the Canvas from fragments/frames
+### Assembling the Canvas from frames

-Here we provide an overview of how a reader should assemble a canvas in case
-of a fragmented-image and in case of an animated image. The notation
-_VP8X.field_ means the field in the 'VP8X' chunk with the same description.
-
-Displaying a _fragmented image_ canvas MUST be equivalent to the following
-pseudocode: _\[status: experimental\]_
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-assert VP8X.flags.hasFragments
-canvas ← new black image of size VP8X.canvasWidth x VP8X.canvasHeight.
-frgm_params ← nil
-for chunk in image_data:
-    assert chunk.tag is "FRGM"
-    frgm_params.fragmentX = Fragment X
-    frgm_params.fragmentY = Fragment Y
-    for subchunk in 'Fragment Data':
-        if subchunk.tag == "ALPH":
-            assert alpha subchunks not found in 'Fragment Data' earlier
-            frgm_params.alpha = alpha_data
-        else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
-            assert bitstream subchunks not found in 'Fragment Data' earlier
-            frgm_params.bitstream = bitstream_data
-    frgm_params.fragmentWidth = Width extracted from bitstream subchunk
-    frgm_params.fragmentHeight = Height extracted from bitstream subchunk
-    assert VP8X.canvasWidth >=
-        frgm_params.fragmentX + frgm_params.fragmentWidth
-    assert VP8X.canvasHeight >=
-        frgm_params.fragmentY + frgm_params.fragmentHeight
-    assert fragment has the properties mentioned in "Image Fragments" section.
-    render fragment with frame_params.alpha and frame_params.bitstream on canvas
-    with top-left corner in (frgm_params.fragmentX, frgm_params.fragmentY).
-canvas contains the decoded canvas.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Here we provide an overview of how a reader should assemble a canvas in the
+case of an animated image. The notation _VP8X.field_ means the field in the
+'VP8X' chunk with the same description.

 Displaying an _animated image_ canvas MUST be equivalent to the following
 pseudocode:
@ -810,28 +735,25 @@ pseudocode:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 assert VP8X.flags.hasAnimation
 canvas ← new image of size VP8X.canvasWidth x VP8X.canvasHeight with
-background color ANIM.background_color.
+         background color ANIM.background_color.
 loop_count ← ANIM.loopCount
 dispose_method ← ANIM.disposeMethod
 if loop_count == 0:
    loop_count = ∞
 frame_params ← nil
-for loop = 0, ..., loop_count - 1
-    assert next chunk in image_data is ANMF
-    frame_params.frameX = Frame X
-    frame_params.frameY = Frame Y
-    frame_params.frameWidth = Frame Width Minus One + 1
-    frame_params.frameHeight = Frame Height Minus One + 1
-    frame_params.frameDuration = Frame Duration
-    assert VP8X.canvasWidth >= frame_params.frameX + frame_params.frameWidth
-    assert VP8X.canvasHeight >= frame_params.frameY + frame_params.frameHeight
-    if VP8X.flags.hasFragments and first subchunk in 'Frame Data' is FRGM
-        // Fragmented frame.
-        frame_params.{bitstream,alpha} = canvas decoded from subchunks in
-                                         'Frame Data' as per the pseudocode for
-                                         _fragmented image_ above.
-    else
-        // Non-fragmented frame.
+assert next chunk in image_data is ANMF
+for loop = 0..loop_count - 1
+    clear canvas to ANIM.background_color or application defined color
+    until eof or non-ANMF chunk
+        frame_params.frameX = Frame X
+        frame_params.frameY = Frame Y
+        frame_params.frameWidth = Frame Width Minus One + 1
+        frame_params.frameHeight = Frame Height Minus One + 1
+        frame_params.frameDuration = Frame Duration
+        frame_right = frame_params.frameX + frame_params.frameWidth
+        frame_bottom = frame_params.frameY + frame_params.frameHeight
+        assert VP8X.canvasWidth >= frame_right
+        assert VP8X.canvasHeight >= frame_bottom
        for subchunk in 'Frame Data':
            if subchunk.tag == "ALPH":
                assert alpha subchunks not found in 'Frame Data' earlier
@ -839,14 +761,15 @@ for loop = 0, ..., loop_count - 1
            else if subchunk.tag == "VP8 " OR subchunk.tag == "VP8L":
                assert bitstream subchunks not found in 'Frame Data' earlier
                frame_params.bitstream = bitstream_data
-    render frame with frame_params.alpha and frame_params.bitstream on canvas
-    with top-left corner in (frame_params.frameX, frame_params.frameY), using
-    dispose method dispose_method.
-    Show the contents of the image for frame_params.frameDuration * 1ms.
-canvas contains the decoded canvas.
+        render frame with frame_params.alpha and frame_params.bitstream on
+            canvas with top-left corner at (frame_params.frameX,
+            frame_params.frameY), using dispose method dispose_method.
+        canvas contains the decoded image.
+        Show the contents of the canvas for frame_params.frameDuration * 1ms.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Example file layouts
+
+Example File Layouts
 --------------------

 A lossy encoded image with alpha may look as follows:
@ -878,17 +801,6 @@ RIFF/WEBP
 +- XMP  (metadata)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-A fragmented image may look as follows:
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-RIFF/WEBP
-+- VP8X (descriptions of features used)
-+- FRGM (fragment1 parameters + data)
-+- FRGM (fragment2 parameters + data)
-+- FRGM (fragment3 parameters + data)
-+- FRGM (fragment4 parameters + data)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
 An animated image with EXIF metadata may look as follows:

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -903,7 +815,8 @@ RIFF/WEBP
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 [vp8spec]:  http://tools.ietf.org/html/rfc6386
-[webpllspec]: https://gerrit.chromium.org/gerrit/gitweb?p=webm/libwebp.git;a=blob;f=doc/webp-lossless-bitstream-spec.txt;hb=master
+[webpllspec]: https://chromium.googlesource.com/webm/libwebp/+/master/doc/webp-lossless-bitstream-spec.txt
 [iccspec]: http://www.color.org/icc_specs2.xalter
 [metadata]: http://www.metadataworkinggroup.org/pdf/mwg_guidance.pdf
+[rfc 1166]: http://tools.ietf.org/html/rfc1166
 [rfc 2119]: http://tools.ietf.org/html/rfc2119
--- a/doc/webp-lossless-bitstream-spec.txt
+++ b/doc/webp-lossless-bitstream-spec.txt
@ -14,6 +14,7 @@ Specification for WebP Lossless Bitstream

 _Jyrki Alakuijala, Ph.D., Google, Inc., 2012-06-19_

+Paragraphs marked as \[AMENDED\] were amended on 2014-09-16.

 Abstract
 --------
@ -172,8 +173,8 @@ It should be set to 0 when all alpha values are 255 in the picture, and
 int alpha_is_used = ReadBits(1);
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The version_number is a 3 bit code that must be discarded by the decoder
-at this time. Complying encoders write a 3-bit value 0.
+The version_number is a 3-bit code that must be set to 0. Any other value
+should be treated as an error. \[AMENDED\]

 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 int version_number = ReadBits(3);
@ -330,7 +331,7 @@ uint32 Select(uint32 L, uint32 T, uint32 TL) {
           abs(pGreen - GREEN(T)) + abs(pBlue - BLUE(T));

  // Return either left or top, the one closer to the prediction.
-  if (pL <= pT) {
+  if (pL < pT) {     // \[AMENDED\]
    return L;
  } else {
    return T;
@ -542,6 +543,9 @@ color.
 argb = color_table[GREEN(argb)];
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+If the index is equal to or larger than color_table_size, the argb color value
+should be set to 0x00000000 (transparent black).  \[AMENDED\]
+
 When the color table is small (equal to or less than 16 colors), several
 pixels are bundled into a single pixel. The pixel bundling packs several
 (2, 4, or 8) pixels into a single pixel, reducing the image width
--- a/examples/Android.mk
+++ b/examples/Android.mk
@ -1,5 +1,8 @@
 LOCAL_PATH := $(call my-dir)

+################################################################################
+# libexample_util
+
 include $(CLEAR_VARS)

 LOCAL_SRC_FILES := \
@ -12,6 +15,9 @@ LOCAL_MODULE := example_util

 include $(BUILD_STATIC_LIBRARY)

+################################################################################
+# cwebp
+
 include $(CLEAR_VARS)

 # Note: to enable jpeg/png encoding the sources from AOSP can be used with
@ -32,6 +38,9 @@ LOCAL_MODULE := cwebp

 include $(BUILD_EXECUTABLE)

+################################################################################
+# dwebp
+
 include $(CLEAR_VARS)

 LOCAL_SRC_FILES := \
@ -44,3 +53,19 @@ LOCAL_STATIC_LIBRARIES := example_util webp
 LOCAL_MODULE := dwebp

 include $(BUILD_EXECUTABLE)
+
+################################################################################
+# webpmux
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+    webpmux.c \
+
+LOCAL_CFLAGS := $(WEBP_CFLAGS)
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../src
+LOCAL_STATIC_LIBRARIES := example_util webpmux webp
+
+LOCAL_MODULE := webpmux_example
+
+include $(BUILD_EXECUTABLE)
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@ -22,6 +22,7 @@

 #include "webp/encode.h"

+#include "./example_util.h"
 #include "./metadata.h"
 #include "./stopwatch.h"

@ -709,6 +710,7 @@ int main(int argc, const char *argv[]) {
  }

  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      HelpShort();
      return 0;
@ -732,30 +734,31 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-short")) {
      ++short_output;
    } else if (!strcmp(argv[c], "-s") && c < argc - 2) {
-      picture.width = strtol(argv[++c], NULL, 0);
-      picture.height = strtol(argv[++c], NULL, 0);
+      picture.width = ExUtilGetInt(argv[++c], 0, &parse_error);
+      picture.height = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
-      config.method = strtol(argv[++c], NULL, 0);
+      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
 #endif
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
-      config.quality = (float)strtod(argv[++c], NULL);
+      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
 #if WEBP_ENCODER_ABI_VERSION > 0x0202
      use_lossless_preset = 0;   // disable -z option
    } else if (!strcmp(argv[c], "-z") && c < argc - 1) {
-      lossless_preset = strtol(argv[++c], NULL, 0);
+      lossless_preset = ExUtilGetInt(argv[++c], 0, &parse_error);
      if (use_lossless_preset != 0) use_lossless_preset = 1;
 #endif
    } else if (!strcmp(argv[c], "-alpha_q") && c < argc - 1) {
-      config.alpha_quality = strtol(argv[++c], NULL, 0);
+      config.alpha_quality = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_method") && c < argc - 1) {
-      config.alpha_compression = strtol(argv[++c], NULL, 0);
+      config.alpha_compression = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-alpha_cleanup")) {
      keep_alpha = keep_alpha ? 2 : 0;
    } else if (!strcmp(argv[c], "-blend_alpha") && c < argc - 1) {
      blend_alpha = 1;
-      background_color = strtol(argv[++c], NULL, 16);  // <- parses '0x' prefix
+      // background color is given in hex with an optional '0x' prefix
+      background_color = ExUtilGetInt(argv[++c], 16, &parse_error);
      background_color = background_color & 0x00ffffffu;
    } else if (!strcmp(argv[c], "-alpha_filter") && c < argc - 1) {
      ++c;
@ -786,13 +789,13 @@ int main(int argc, const char *argv[]) {
        goto Error;
      }
    } else if (!strcmp(argv[c], "-size") && c < argc - 1) {
-      config.target_size = strtol(argv[++c], NULL, 0);
+      config.target_size = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-psnr") && c < argc - 1) {
-      config.target_PSNR = (float)strtod(argv[++c], NULL);
+      config.target_PSNR = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-sns") && c < argc - 1) {
-      config.sns_strength = strtol(argv[++c], NULL, 0);
+      config.sns_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
-      config.filter_strength = strtol(argv[++c], NULL, 0);
+      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-af")) {
      config.autofilter = 1;
    } else if (!strcmp(argv[c], "-jpeg_like")) {
@ -806,26 +809,26 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-nostrong")) {
      config.filter_type = 0;
    } else if (!strcmp(argv[c], "-sharpness") && c < argc - 1) {
-      config.filter_sharpness = strtol(argv[++c], NULL, 0);
+      config.filter_sharpness = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-pass") && c < argc - 1) {
-      config.pass = strtol(argv[++c], NULL, 0);
+      config.pass = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-pre") && c < argc - 1) {
-      config.preprocessing = strtol(argv[++c], NULL, 0);
+      config.preprocessing = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-segments") && c < argc - 1) {
-      config.segments = strtol(argv[++c], NULL, 0);
+      config.segments = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-partition_limit") && c < argc - 1) {
-      config.partition_limit = strtol(argv[++c], NULL, 0);
+      config.partition_limit = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-map") && c < argc - 1) {
-      picture.extra_info_type = strtol(argv[++c], NULL, 0);
+      picture.extra_info_type = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      crop = 1;
-      crop_x = strtol(argv[++c], NULL, 0);
-      crop_y = strtol(argv[++c], NULL, 0);
-      crop_w = strtol(argv[++c], NULL, 0);
-      crop_h = strtol(argv[++c], NULL, 0);
+      crop_x = ExUtilGetInt(argv[++c], 0, &parse_error);
+      crop_y = ExUtilGetInt(argv[++c], 0, &parse_error);
+      crop_w = ExUtilGetInt(argv[++c], 0, &parse_error);
+      crop_h = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-resize") && c < argc - 2) {
-      resize_w = strtol(argv[++c], NULL, 0);
-      resize_h = strtol(argv[++c], NULL, 0);
+      resize_w = ExUtilGetInt(argv[++c], 0, &parse_error);
+      resize_h = ExUtilGetInt(argv[++c], 0, &parse_error);
 #ifndef WEBP_DLL
    } else if (!strcmp(argv[c], "-noasm")) {
      VP8GetCPUInfo = NULL;
@ -920,6 +923,11 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
+
+    if (parse_error) {
+      HelpLong();
+      return -1;
+    }
  }
  if (in_file == NULL) {
    fprintf(stderr, "No input file specified!\n");
@ -981,7 +989,7 @@ int main(int argc, const char *argv[]) {
  // Open the output
  if (out_file != NULL) {
    const int use_stdout = !strcmp(out_file, "-");
-    out = use_stdout ? stdout : fopen(out_file, "wb");
+    out = use_stdout ? ExUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
    if (out == NULL) {
      fprintf(stderr, "Error! Cannot open output file '%s'\n", out_file);
      goto Error;
@ -1126,7 +1134,7 @@ int main(int argc, const char *argv[]) {
  return_value = 0;

 Error:
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
  WebPMemoryWriterClear(&memory_writer);
 #else
  free(memory_writer.mem);
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@ -22,6 +22,7 @@

 #ifdef WEBP_HAVE_PNG
 #include <png.h>
+#include <setjmp.h>   // note: this must be included *after* png.h
 #endif

 #ifdef HAVE_WINCODEC_H
@ -38,11 +39,6 @@
 #include <wincodec.h>
 #endif

-#if defined(_WIN32)
-#include <fcntl.h>   // for _O_BINARY
-#include <io.h>      // for _setmode()
-#endif
-
 #include "webp/decode.h"
 #include "./example_util.h"
 #include "./stopwatch.h"
@ -197,8 +193,8 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  uint8_t* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_RGBA);
-  png_structp png;
-  png_infop info;
+  volatile png_structp png;
+  volatile png_infop info;
  png_uint_32 y;

  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
@ -208,11 +204,11 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  }
  info = png_create_info_struct(png);
  if (info == NULL) {
-    png_destroy_write_struct(&png, NULL);
+    png_destroy_write_struct((png_structpp)&png, NULL);
    return 0;
  }
  if (setjmp(png_jmpbuf(png))) {
-    png_destroy_write_struct(&png, &info);
+    png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
    return 0;
  }
  png_init_io(png, out_file);
@ -226,7 +222,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
    png_write_rows(png, &row, 1);
  }
  png_write_end(png, info);
-  png_destroy_write_struct(&png, &info);
+  png_destroy_write_struct((png_structpp)&png, (png_infopp)&info);
  return 1;
 }
 #else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG
@ -249,10 +245,10 @@ static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  uint32_t y;

  if (alpha) {
-    fprintf(fout, "P7\nWIDTH %d\nHEIGHT %d\nDEPTH 4\nMAXVAL 255\n"
+    fprintf(fout, "P7\nWIDTH %u\nHEIGHT %u\nDEPTH 4\nMAXVAL 255\n"
                  "TUPLTYPE RGB_ALPHA\nENDHDR\n", width, height);
  } else {
-    fprintf(fout, "P6\n%d %d\n255\n", width, height);
+    fprintf(fout, "P6\n%u %u\n255\n", width, height);
  }
  for (y = 0; y < height; ++y) {
    if (fwrite(rgb + y * stride, width, bytes_per_px, fout) != bytes_per_px) {
@ -409,7 +405,7 @@ static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const int a_stride = buffer->u.YUVA.a_stride;
  uint32_t y;
  assert(a != NULL);
-  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  fprintf(fout, "P5\n%u %u\n255\n", width, height);
  for (y = 0; y < height; ++y) {
    if (fwrite(a + y * a_stride, width, 1, fout) != 1) {
      return 0;
@ -482,15 +478,8 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
  needs_open_file = (format != PNG);
 #endif

-#if defined(_WIN32)
-  if (use_stdout && _setmode(_fileno(stdout), _O_BINARY) == -1) {
-    fprintf(stderr, "Failed to reopen stdout in O_BINARY mode.\n");
-    return -1;
-  }
-#endif
-
  if (needs_open_file) {
-    fout = use_stdout ? stdout : fopen(out_file, "wb");
+    fout = use_stdout ? ExUtilSetBinaryMode(stdout) : fopen(out_file, "wb");
    if (fout == NULL) {
      fprintf(stderr, "Error opening output file %s\n", out_file);
      return 0;
@ -557,7 +546,7 @@ static void Help(void) {
         "  -nofilter .... disable in-loop filtering\n"
         "  -nodither .... disable dithering\n"
         "  -dither <d> .. dithering strength (in 0..100)\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -alpha_dither  use alpha-plane dithering if needed\n"
 #endif
         "  -mt .......... use multi-threading\n"
@ -598,6 +587,7 @@ int main(int argc, const char *argv[]) {
  }

  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
@ -628,24 +618,25 @@ int main(int argc, const char *argv[]) {
      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-alpha_dither")) {
      config.options.alpha_dithering_strength = 100;
 #endif
    } else if (!strcmp(argv[c], "-nodither")) {
      config.options.dithering_strength = 0;
    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
-      config.options.dithering_strength = strtol(argv[++c], NULL, 0);
+      config.options.dithering_strength =
+          ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      config.options.use_cropping = 1;
-      config.options.crop_left   = strtol(argv[++c], NULL, 0);
-      config.options.crop_top    = strtol(argv[++c], NULL, 0);
-      config.options.crop_width  = strtol(argv[++c], NULL, 0);
-      config.options.crop_height = strtol(argv[++c], NULL, 0);
+      config.options.crop_left   = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_top    = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.crop_height = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
      config.options.use_scaling = 1;
-      config.options.scaled_width  = strtol(argv[++c], NULL, 0);
-      config.options.scaled_height = strtol(argv[++c], NULL, 0);
+      config.options.scaled_width  = ExUtilGetInt(argv[++c], 0, &parse_error);
+      config.options.scaled_height = ExUtilGetInt(argv[++c], 0, &parse_error);
 #if WEBP_DECODER_ABI_VERSION > 0x0203
    } else if (!strcmp(argv[c], "-flip")) {
      config.options.flip = 1;
@ -668,6 +659,11 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
  }

  if (in_file == NULL) {
--- a/examples/example_util.c
+++ b/examples/example_util.c
@ -11,6 +11,11 @@
 //

 #include "./example_util.h"
+
+#if defined(_WIN32)
+#include <fcntl.h>   // for _O_BINARY
+#include <io.h>      // for _setmode()
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@ -18,12 +23,50 @@
 #include "webp/decode.h"
 #include "./stopwatch.h"

+//------------------------------------------------------------------------------
+// String parsing
+
+uint32_t ExUtilGetUInt(const char* const v, int base, int* const error) {
+  char* end = NULL;
+  const uint32_t n = (v != NULL) ? (uint32_t)strtoul(v, &end, base) : 0u;
+  if (end == v && error != NULL && !*error) {
+    *error = 1;
+    fprintf(stderr, "Error! '%s' is not an integer.\n",
+            (v != NULL) ? v : "(null)");
+  }
+  return n;
+}
+
+int ExUtilGetInt(const char* const v, int base, int* const error) {
+  return (int)ExUtilGetUInt(v, base, error);
+}
+
+float ExUtilGetFloat(const char* const v, int* const error) {
+  char* end = NULL;
+  const float f = (v != NULL) ? (float)strtod(v, &end) : 0.f;
+  if (end == v && error != NULL && !*error) {
+    *error = 1;
+    fprintf(stderr, "Error! '%s' is not a floating point number.\n",
+            (v != NULL) ? v : "(null)");
+  }
+  return f;
+}
+
 // -----------------------------------------------------------------------------
 // File I/O

-static const size_t kBlockSize = 16384;  // default initial size
+FILE* ExUtilSetBinaryMode(FILE* file) {
+#if defined(_WIN32)
+  if (_setmode(_fileno(file), _O_BINARY) == -1) {
+    fprintf(stderr, "Failed to reopen file in O_BINARY mode.\n");
+    return NULL;
+  }
+#endif
+  return file;
+}

 int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
+  static const size_t kBlockSize = 16384;  // default initial size
  size_t max_size = 0;
  size_t size = 0;
  uint8_t* input = NULL;
@ -32,6 +75,8 @@ int ExUtilReadFromStdin(const uint8_t** data, size_t* data_size) {
  *data = NULL;
  *data_size = 0;

+  if (!ExUtilSetBinaryMode(stdin)) return 0;
+
  while (!feof(stdin)) {
    // We double the buffer size each time and read as much as possible.
    const size_t extra_size = (max_size == 0) ? kBlockSize : max_size;
--- a/examples/example_util.h
+++ b/examples/example_util.h
@ -13,12 +13,30 @@
 #ifndef WEBP_EXAMPLES_EXAMPLE_UTIL_H_
 #define WEBP_EXAMPLES_EXAMPLE_UTIL_H_

+#include <stdio.h>
 #include "webp/decode.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+//------------------------------------------------------------------------------
+// String parsing
+
+// Parses 'v' using strtoul() or strtod(). If error is non-NULL, '*error' is
+// set to true on failure, while on success it is left unmodified to allow
+// chaining of calls. An error is only printed on the first occurrence.
+uint32_t ExUtilGetUInt(const char* const v, int base, int* const error);
+int ExUtilGetInt(const char* const v, int base, int* const error);
+float ExUtilGetFloat(const char* const v, int* const error);
+
+//------------------------------------------------------------------------------
+// File I/O
+
+// Reopen file in binary (O_BINARY) mode.
+// Returns 'file' on success, NULL otherwise.
+FILE* ExUtilSetBinaryMode(FILE* file);
+
 // Allocates storage for entire file 'file_name' and returns contents and size
 // in 'data' and 'data_size'. Returns 1 on success, 0 otherwise. '*data' should
 // be deleted using free().
--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@ -46,7 +46,7 @@

 //------------------------------------------------------------------------------

-static int transparent_index = -1;  // Index of transparent color in the map.
+static int transparent_index = -1;  // Opaque frame by default.

 static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
                                      size_t* const kmax_ptr) {
@ -163,21 +163,22 @@ static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
  return ok;
 }

-static int GetBackgroundColor(const ColorMapObject* const color_map,
-                              int bgcolor_idx, uint32_t* const bgcolor) {
+static void GetBackgroundColor(const ColorMapObject* const color_map,
+                               int bgcolor_idx, uint32_t* const bgcolor) {
  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
-    return 1;
  } else if (color_map == NULL || color_map->Colors == NULL
             || bgcolor_idx >= color_map->ColorCount) {
-    return 0;  // Invalid color map or index.
+    *bgcolor = WHITE_COLOR;
+    fprintf(stderr,
+            "GIF decode warning: invalid background color index. Assuming "
+            "white background.\n");
  } else {
    const GifColorType color = color_map->Colors[bgcolor_idx];
    *bgcolor = (0xff        << 24)
             | (color.Red   << 16)
             | (color.Green <<  8)
             | (color.Blue  <<  0);
-    return 1;
  }
 }

@ -258,7 +259,8 @@ int main(int argc, const char *argv[]) {
  GifFileType* gif = NULL;
  WebPConfig config;
  WebPPicture frame;
-  WebPMuxFrameInfo info;
+  int duration = 0;
+  FrameDisposeMethod orig_dispose = FRAME_DISPOSE_NONE;
  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
  WebPFrameCache* cache = NULL;

@ -278,17 +280,11 @@ int main(int argc, const char *argv[]) {
  size_t kmax = 0;
  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.

-  memset(&info, 0, sizeof(info));
-  info.id = WEBP_CHUNK_ANMF;
-  info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
-  info.blend_method = WEBP_MUX_BLEND;
-
  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
  config.lossless = 1;  // Use lossless compression by default.
-  config.image_hint = WEBP_HINT_GRAPH;   // always low-color

  if (argc == 1) {
    Help();
@ -296,6 +292,7 @@ int main(int argc, const char *argv[]) {
  }

  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
@ -307,17 +304,17 @@ int main(int argc, const char *argv[]) {
      allow_mixed = 1;
      config.lossless = 0;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
-      config.quality = (float)strtod(argv[++c], NULL);
+      config.quality = ExUtilGetFloat(argv[++c], &parse_error);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
-      config.method = strtol(argv[++c], NULL, 0);
+      config.method = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
-      kmax = strtoul(argv[++c], NULL, 0);
+      kmax = ExUtilGetUInt(argv[++c], 0, &parse_error);
      default_kmax = 0;
    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
-      kmin = strtoul(argv[++c], NULL, 0);
+      kmin = ExUtilGetUInt(argv[++c], 0, &parse_error);
      default_kmin = 0;
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
-      config.filter_strength = strtol(argv[++c], NULL, 0);
+      config.filter_strength = ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-metadata") && c < argc - 1) {
      static const struct {
        const char* option;
@ -381,6 +378,11 @@ int main(int argc, const char *argv[]) {
    } else {
      in_file = argv[c];
    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
  }

  // Appropriate default kmin, kmax values for lossy and lossless.
@ -469,6 +471,10 @@ int main(int argc, const char *argv[]) {
          cache = WebPFrameCacheNew(frame.width, frame.height,
                                    kmin, kmax, allow_mixed);
          if (cache == NULL) goto End;
+
+          // Background color.
+          GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
+                             &anim.bgcolor);
        }
        // Some even more broken GIF can have sub-rect with zero width/height.
        if (image_desc->Width == 0 || image_desc->Height == 0) {
@ -480,7 +486,8 @@ int main(int argc, const char *argv[]) {
          goto End;
        }

-        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, &frame, &info)) {
+        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, orig_dispose,
+                                    duration, &frame)) {
          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
          fprintf(stderr, "Error code: %d\n", frame.error_code);
        }
@ -492,6 +499,13 @@ int main(int argc, const char *argv[]) {
          goto End;
        }
        is_first_frame = 0;
+
+        // In GIF, graphic control extensions are optional for a frame, so we
+        // may not get one before reading the next frame. To handle this case,
+        // we reset frame properties to reasonable defaults for the next frame.
+        orig_dispose = FRAME_DISPOSE_NONE;
+        duration = 0;
+        transparent_index = -1;  // Opaque frame by default.
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@ -509,29 +523,21 @@ int main(int argc, const char *argv[]) {
            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
            if (data[0] != 4) goto End;
-            info.duration = delay * 10;  // Duration is in 1 ms units for WebP.
-            if (dispose == 3) {
-              static int warning_printed = 0;
-              if (!warning_printed) {
-                fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE unsupported.\n");
-                warning_printed = 1;
-              }
-              // failsafe. TODO(urvang): emulate the correct behaviour by
-              // recoding the whole frame.
-              info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
-            } else {
-              info.dispose_method =
-                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
-                                 : WEBP_MUX_DISPOSE_NONE;
+            duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            switch (dispose) {
+              case 3:
+                orig_dispose = FRAME_DISPOSE_RESTORE_PREVIOUS;
+                break;
+              case 2:
+                orig_dispose = FRAME_DISPOSE_BACKGROUND;
+                break;
+              case 1:
+              case 0:
+              default:
+                orig_dispose = FRAME_DISPOSE_NONE;
+                break;
            }
            transparent_index = (flags & GIF_TRANSPARENT_MASK) ? data[4] : -1;
-            if (is_first_frame) {
-              if (!GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
-                                      &anim.bgcolor)) {
-                fprintf(stderr, "GIF decode warning: invalid background color "
-                                "index. Assuming white background.\n");
-              }
-            }
            break;
          }
          case PLAINTEXT_EXT_FUNC_CODE: {
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@ -29,6 +29,13 @@ extern "C" {

 struct WebPPicture;

+// Includes all disposal methods, even the ones not supported by WebP bitstream.
+typedef enum FrameDisposeMethod {
+  FRAME_DISPOSE_NONE,             // gif2webp: GIF disposal 0, 1 or unknown.
+  FRAME_DISPOSE_BACKGROUND,       // gif2webp: GIF disposal 2.
+  FRAME_DISPOSE_RESTORE_PREVIOUS  // gif2webp: GIF disposal 3.
+} FrameDisposeMethod;
+
 typedef struct {
  int x_offset, y_offset, width, height;
 } WebPFrameRect;
@ -53,15 +60,15 @@ WebPFrameCache* WebPFrameCacheNew(int width, int height,
 // Release all the frame data from 'cache' and free 'cache'.
 void WebPFrameCacheDelete(WebPFrameCache* const cache);

-// Given an image described by 'frame', 'info' and 'orig_rect', optimize it for
-// WebP, encode it and add it to 'cache'. 'orig_rect' can be NULL.
-// This takes care of frame disposal too, according to 'info->dispose_method'.
+// Given an image described by 'frame', 'rect', 'dispose_method' and 'duration',
+// optimize it for WebP, encode it and add it to 'cache'. 'rect' can be NULL.
+// This takes care of frame disposal too, according to 'dispose_method'.
 // Returns false in case of error (and sets frame->error_code accordingly).
 int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
                           const WebPConfig* const config,
-                           const WebPFrameRect* const orig_rect,
-                           WebPPicture* const frame,
-                           WebPMuxFrameInfo* const info);
+                           const WebPFrameRect* const rect,
+                           FrameDisposeMethod dispose_method, int duration,
+                           WebPPicture* const frame);

 // Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
 // true, prints the information about these frames.
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@ -14,6 +14,8 @@
 #ifndef WEBP_EXAMPLES_STOPWATCH_H_
 #define WEBP_EXAMPLES_STOPWATCH_H_

+#include "webp/types.h"
+
 #if defined _WIN32 && !defined __GNUC__
 #include <windows.h>

@ -37,6 +39,7 @@ static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {


 #else    /* !_WIN32 */
+#include <string.h>  // memcpy
 #include <sys/time.h>

 typedef struct timeval Stopwatch;
--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@ -42,8 +42,6 @@
 #define snprintf _snprintf
 #endif

-static void Help(void);
-
 // Unfortunate global variables. Gathered into a struct for comfort.
 static struct {
  int has_animation;
@ -82,6 +80,16 @@ static void ClearParams(void) {
  kParams.dmux = NULL;
 }

+// Sets the previous frame to the dimensions of the canvas and has it dispose
+// to background to cause the canvas to be cleared.
+static void ClearPreviousFrame(void) {
+  WebPIterator* const prev = &kParams.prev_frame;
+  prev->width = kParams.canvas_width;    // Rectangle spans the full canvas...
+  prev->height = kParams.canvas_height;
+  prev->x_offset = prev->y_offset = 0;   // ...with no offset.
+  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;  // Triggers the clear.
+}
+
 // -----------------------------------------------------------------------------
 // Color profile handling
 static int ApplyColorProfile(const WebPData* const profile,
@ -181,6 +189,8 @@ static void decode_callback(int what) {
        if (WebPDemuxGetFrame(kParams.dmux, 1, curr)) {
          --kParams.loop_count;
          kParams.done = (kParams.loop_count == 0);
+          if (kParams.done) return;
+          ClearPreviousFrame();
        } else {
          kParams.decoding_error = 1;
          kParams.done = 1;
@ -298,19 +308,24 @@ static void HandleDisplay(void) {
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
    int window_x, window_y;
+    int frame_w, frame_h;
    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
      // Clear the previous frame rectangle.
      window_x = prev->x_offset;
      window_y = kParams.canvas_height - prev->y_offset - prev->height;
+      frame_w = prev->width;
+      frame_h = prev->height;
    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
      // We simulate no-blending behavior by first clearing the current frame
      // rectangle (to a checker-board) and then alpha-blending against it.
      window_x = curr->x_offset;
      window_y = kParams.canvas_height - curr->y_offset - curr->height;
+      frame_w = curr->width;
+      frame_h = curr->height;
    }
    glEnable(GL_SCISSOR_TEST);
    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y, prev->width, prev->height);
+    glScissor(window_x, window_y, frame_w, frame_h);

    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();
@ -377,7 +392,7 @@ static void Help(void) {
         "  -nofancy ..... don't use the fancy YUV420 upscaler\n"
         "  -nofilter .... disable in-loop filtering\n"
         "  -dither <int>  dithering strength (0..100), default=50\n"
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
         "  -noalphadither disable alpha plane dithering\n"
 #endif
         "  -mt .......... use multi-threading\n"
@ -395,19 +410,19 @@ int main(int argc, char *argv[]) {
  int c;
  WebPDecoderConfig* const config = &kParams.config;
  WebPIterator* const curr = &kParams.curr_frame;
-  WebPIterator* const prev = &kParams.prev_frame;

  if (!WebPInitDecoderConfig(config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return -1;
  }
  config->options.dithering_strength = 50;
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
  config->options.alpha_dithering_strength = 100;
 #endif
  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
+    int parse_error = 0;
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
@ -417,12 +432,13 @@ int main(int argc, char *argv[]) {
      config->options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
      config->options.bypass_filtering = 1;
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
    } else if (!strcmp(argv[c], "-noalphadither")) {
      config->options.alpha_dithering_strength = 0;
 #endif
    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
-      config->options.dithering_strength = strtol(argv[++c], NULL, 0);
+      config->options.dithering_strength =
+          ExUtilGetInt(argv[++c], 0, &parse_error);
    } else if (!strcmp(argv[c], "-info")) {
      kParams.print_info = 1;
    } else if (!strcmp(argv[c], "-version")) {
@ -445,6 +461,11 @@ int main(int argc, char *argv[]) {
    } else {
      kParams.file_name = argv[c];
    }
+
+    if (parse_error) {
+      Help();
+      return -1;
+    }
  }

  if (kParams.file_name == NULL) {
@ -479,10 +500,7 @@ int main(int argc, char *argv[]) {
    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
  }

-  prev->width = kParams.canvas_width;
-  prev->height = kParams.canvas_height;
-  prev->x_offset = prev->y_offset = 0;
-  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+  ClearPreviousFrame();

  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
  kParams.has_color_profile =
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@ -402,8 +402,9 @@ static int CreateMux(const char* const filename, WebPMux** mux) {

 static int WriteData(const char* filename, const WebPData* const webpdata) {
  int ok = 0;
-  FILE* fout = strcmp(filename, "-") ? fopen(filename, "wb") : stdout;
-  if (!fout) {
+  FILE* fout = strcmp(filename, "-") ? fopen(filename, "wb")
+                                     : ExUtilSetBinaryMode(stdout);
+  if (fout == NULL) {
    fprintf(stderr, "Error opening output WebP file %s!\n", filename);
    return 0;
  }
@ -487,7 +488,7 @@ static int ParseBgcolorArgs(const char* args, uint32_t* const bgcolor) {
 static void DeleteConfig(WebPMuxConfig* config) {  // Does not free 'config'.
  if (config != NULL) {
    free(config->feature_.args_);
-    free(config);
+    memset(config, 0, sizeof(*config));  // Reset; drops the stale args_.
  }
 }

@ -790,33 +791,27 @@ static int ValidateConfig(WebPMuxConfig* config) {

 // Create config object from command-line arguments.
 static int InitializeConfig(int argc, const char* argv[],
-                            WebPMuxConfig** config) {
+                            WebPMuxConfig* config) {
  int num_feature_args = 0;
  int ok = 1;

  assert(config != NULL);
-  *config = NULL;
+  memset(config, 0, sizeof(*config));

  // Validate command-line arguments.
  if (!ValidateCommandLine(argc, argv, &num_feature_args)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }

-  // Allocate memory.
-  *config = (WebPMuxConfig*)calloc(1, sizeof(**config));
-  if (*config == NULL) {
-    ERROR_GOTO1("ERROR: Memory allocation error.\n", Err1);
-  }
-  (*config)->feature_.arg_count_ = num_feature_args;
-  (*config)->feature_.args_ =
-      (FeatureArg*)calloc(num_feature_args, sizeof(FeatureArg));
-  if ((*config)->feature_.args_ == NULL) {
+  config->feature_.arg_count_ = num_feature_args;
+  config->feature_.args_ =
+      (FeatureArg*)calloc(num_feature_args, sizeof(*config->feature_.args_));
+  if (config->feature_.args_ == NULL) {
    ERROR_GOTO1("ERROR: Memory allocation error.\n", Err1);
  }

  // Parse command-line.
-  if (!ParseCommandLine(argc, argv, *config) ||
-      !ValidateConfig(*config)) {
+  if (!ParseCommandLine(argc, argv, config) || !ValidateConfig(config)) {
    ERROR_GOTO1("Exiting due to command-line parsing error.\n", Err1);
  }

@ -838,14 +833,16 @@ static int GetFrameFragment(const WebPMux* mux,
  WebPMux* mux_single = NULL;
  long num = 0;
  int ok = 1;
+  int parse_error = 0;
  const WebPChunkId id = is_frame ? WEBP_CHUNK_ANMF : WEBP_CHUNK_FRGM;
  WebPMuxFrameInfo info;
  WebPDataInit(&info.bitstream);

-  num = strtol(config->feature_.args_[0].params_, NULL, 10);
+  num = ExUtilGetInt(config->feature_.args_[0].params_, 10, &parse_error);
  if (num < 0) {
    ERROR_GOTO1("ERROR: Frame/Fragment index must be non-negative.\n", ErrGet);
  }
+  if (parse_error) goto ErrGet;

  err = WebPMuxGetFrame(mux, num, &info);
  if (err == WEBP_MUX_OK && info.id != id) err = WEBP_MUX_NOT_FOUND;
@ -871,7 +868,7 @@ static int GetFrameFragment(const WebPMux* mux,
 ErrGet:
  WebPDataClear(&info.bitstream);
  WebPMuxDelete(mux_single);
-  return ok;
+  return ok && !parse_error;
 }

 // Read and process config.
@ -933,16 +930,19 @@ static int Process(const WebPMuxConfig* config) {
                break;
              }
              case SUBTYPE_LOOP: {
-                const long loop_count =
-                    strtol(feature->args_[i].params_, NULL, 10);
-                if (loop_count != (int)loop_count) {
+                int parse_error = 0;
+                const int loop_count =
+                    ExUtilGetInt(feature->args_[i].params_, 10, &parse_error);
+                if (loop_count < 0 || loop_count > 65535) {
                  // Note: This is only a 'necessary' condition for loop_count
                  // to be valid. The 'sufficient' condition is checked in
                  // WebPMuxSetAnimationParams() method called later.
                  ERROR_GOTO1("ERROR: Loop count must be in the range 0 to "
                              "65535.\n", Err2);
                }
-                params.loop_count = (int)loop_count;
+                ok = !parse_error;
+                if (!ok) goto Err2;
+                params.loop_count = loop_count;
                break;
              }
              case SUBTYPE_ANMF: {
@ -1042,8 +1042,8 @@ static int Process(const WebPMuxConfig* config) {
                      ErrorString(err), kDescriptions[feature->type_], Err2);
        }
      } else {
-          ERROR_GOTO1("ERROR: Invalid feature for action 'strip'.\n", Err2);
-          break;
+        ERROR_GOTO1("ERROR: Invalid feature for action 'strip'.\n", Err2);
+        break;
      }
      ok = WriteWebP(mux, config->output_);
      break;
@ -1069,14 +1069,14 @@ static int Process(const WebPMuxConfig* config) {
 // Main.

 int main(int argc, const char* argv[]) {
-  WebPMuxConfig* config;
+  WebPMuxConfig config;  // Caller-owned storage, filled by InitializeConfig.
  int ok = InitializeConfig(argc - 1, argv + 1, &config);  // Skips argv[0].
  if (ok) {
-    ok = Process(config);
+    ok = Process(&config);
  } else {
    PrintHelp();
  }
-  DeleteConfig(config);
+  DeleteConfig(&config);  // Releases what 'config' owns, not 'config' itself.
  return !ok;  // Process exit status: 0 on success, 1 on failure.
 }

--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@ -15,6 +15,7 @@
 #include "webp/config.h"
 #endif

+#include <assert.h>
 #include <stdio.h>

 #ifdef HAVE_WINCODEC_H
@ -72,6 +73,12 @@ WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppBGRA_,
 WEBP_DEFINE_GUID(GUID_WICPixelFormat32bppRGBA_,
                 0xf5c7ad2d, 0x6a8d, 0x43dd,
                 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppBGRA_,
+                 0x1562ff7c, 0xd352, 0x46f9,
+                 0x97, 0x9e, 0x42, 0x97, 0x6b, 0x79, 0x22, 0x46);
+WEBP_DEFINE_GUID(GUID_WICPixelFormat64bppRGBA_,
+                 0x6fddc324, 0x4e03, 0x4bfe,
+                 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x16);

 static HRESULT OpenInputStream(const char* filename, IStream** stream) {
  HRESULT hr = S_OK;
@ -109,6 +116,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
    IFS(IWICBitmapFrameDecode_GetColorContexts(frame,
                                               count, color_contexts,
                                               &num_color_contexts));
+    assert(FAILED(hr) || num_color_contexts <= count);
    for (i = 0; SUCCEEDED(hr) && i < num_color_contexts; ++i) {
      WICColorContextType type;
      IFS(IWICColorContext_GetType(color_contexts[i], &type));
@ -116,7 +124,7 @@ static HRESULT ExtractICCP(IWICImagingFactory* const factory,
        UINT size;
        IFS(IWICColorContext_GetProfileBytes(color_contexts[i],
                                             0, NULL, &size));
-        if (size > 0) {
+        if (SUCCEEDED(hr) && size > 0) {
          iccp->bytes = (uint8_t*)malloc(size);
          if (iccp->bytes == NULL) {
            hr = E_OUTOFMEMORY;
@ -194,7 +202,11 @@ static int HasAlpha(IWICImagingFactory* const factory,
    has_alpha = IsEqualGUID(MAKE_REFGUID(pixel_format),
                            MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA_)) ||
                IsEqualGUID(MAKE_REFGUID(pixel_format),
-                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_));
+                            MAKE_REFGUID(GUID_WICPixelFormat32bppBGRA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppRGBA_)) ||
+                IsEqualGUID(MAKE_REFGUID(pixel_format),
+                            MAKE_REFGUID(GUID_WICPixelFormat64bppBGRA_));
  }
  return has_alpha;
 }
@ -261,7 +273,7 @@ int ReadPictureWithWIC(const char* const filename,
  IFS(IWICBitmapFrameDecode_GetPixelFormat(frame, &src_pixel_format));
  IFS(IWICBitmapDecoder_GetContainerFormat(decoder, &src_container_format));

-  if (keep_alpha) {
+  if (SUCCEEDED(hr) && keep_alpha) {
    const GUID** guid;
    for (guid = kAlphaContainers; *guid != NULL; ++guid) {
      if (IsEqualGUID(MAKE_REFGUID(src_container_format),
--- a/iosbuild.sh
+++ b/iosbuild.sh
@ -12,31 +12,39 @@
 set -e

 # Extract the latest SDK version from the final field of the form: iphoneosX.Y
-declare -r SDK=$(xcodebuild -showsdks \
+readonly SDK=$(xcodebuild -showsdks \
  | grep iphoneos | sort | tail -n 1 | awk '{print substr($NF, 9)}'
 )
 # Extract Xcode version.
-declare -r XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
+readonly XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
+if [[ -z "${XCODE}" ]]; then
+  echo "Xcode not available"
+  exit 1
+fi

-declare -r OLDPATH=${PATH}
+readonly OLDPATH=${PATH}

 # Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
 # Note that iPhoneOS-V6 support is not available with the iOS6 SDK.
-declare -r PLATFORMS="iPhoneSimulator iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
-declare -r SRCDIR=$(dirname $0)
-declare -r TOPDIR=$(pwd)
-declare -r BUILDDIR="${TOPDIR}/iosbuild"
-declare -r TARGETDIR="${TOPDIR}/WebP.framework"
-declare -r DEVELOPER=$(xcode-select --print-path)
-declare -r PLATFORMSROOT="${DEVELOPER}/Platforms"
-declare -r LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+PLATFORMS="iPhoneSimulator iPhoneSimulator64"
+PLATFORMS+=" iPhoneOS-V7 iPhoneOS-V7s iPhoneOS-V7-arm64"
+readonly PLATFORMS
+readonly SRCDIR=$(dirname $0)
+readonly TOPDIR=$(pwd)
+readonly BUILDDIR="${TOPDIR}/iosbuild"
+readonly TARGETDIR="${TOPDIR}/WebP.framework"
+readonly DEVELOPER=$(xcode-select --print-path)
+readonly PLATFORMSROOT="${DEVELOPER}/Platforms"
+readonly LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
 LIBLIST=''

 if [[ -z "${SDK}" ]]; then
  echo "iOS SDK not available"
  exit 1
-elif [[ ${SDK} < 4.0 ]]; then
-  echo "You need iOS SDK version 4.0 or above"
+elif [[ ${SDK%%.*} -gt 8 ]]; then
+  EXTRA_CFLAGS="-fembed-bitcode"
+elif [[ ${SDK} < 6.0 ]]; then
+  echo "You need iOS SDK version 6.0 or above"
  exit 1
 else
  echo "iOS SDK Version ${SDK}"
@ -47,7 +55,17 @@ rm -rf ${TARGETDIR}
 mkdir -p ${BUILDDIR}
 mkdir -p ${TARGETDIR}/Headers/

-[[ -e ${SRCDIR}/configure ]] || (cd ${SRCDIR} && sh autogen.sh)
+if [[ ! -e ${SRCDIR}/configure ]]; then
+  if ! (cd ${SRCDIR} && sh autogen.sh); then
+    cat <<EOT
+Error creating configure script!
+This script requires the autoconf/automake and libtool to build. MacPorts can
+be used to obtain these:
+http://www.macports.org/install.php
+EOT
+    exit 1
+  fi
+fi

 for PLATFORM in ${PLATFORMS}; do
  ARCH2=""
@ -64,6 +82,9 @@ for PLATFORM in ${PLATFORMS}; do
  elif [[ "${PLATFORM}" == "iPhoneOS-V6" ]]; then
    PLATFORM="iPhoneOS"
    ARCH="armv6"
+  elif [[ "${PLATFORM}" == "iPhoneSimulator64" ]]; then
+    PLATFORM="iPhoneSimulator"
+    ARCH="x86_64"
  else
    ARCH="i386"
  fi
@ -71,30 +92,20 @@ for PLATFORM in ${PLATFORMS}; do
  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
  mkdir -p "${ROOTDIR}"

-  SDKROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
+  DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
+  SDKROOT="${PLATFORMSROOT}/"
+  SDKROOT+="${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
  CFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT} -O3 -DNDEBUG"
-  LDFLAGS="-arch ${ARCH2:-${ARCH}} -pipe -isysroot ${SDKROOT}"
+  CFLAGS+=" -miphoneos-version-min=6.0 ${EXTRA_CFLAGS}"

-  if [[ -z "${XCODE}" ]]; then
-    echo "XCODE not available"
-    exit 1
-  elif [[ ${SDK} < 5.0.0 ]]; then
-    DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/"
-  else
-    DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
-    CFLAGS+=" -miphoneos-version-min=5.0"
-    LDFLAGS+=" -miphoneos-version-min=5.0"
-  fi
-
-  export CFLAGS
-  export LDFLAGS
-  export CXXFLAGS=${CFLAGS}
+  set -x
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"
-
  ${SRCDIR}/configure --host=${ARCH}-apple-darwin --prefix=${ROOTDIR} \
    --build=$(${SRCDIR}/config.guess) \
    --disable-shared --enable-static \
-    --enable-libwebpdecoder --enable-swap-16bit-csp
+    --enable-libwebpdecoder --enable-swap-16bit-csp \
+    CFLAGS="${CFLAGS}"
+  set +x

  # run make only in the src/ directory to create libwebpdecoder.a
  cd src/
@ -109,5 +120,5 @@ for PLATFORM in ${PLATFORMS}; do
  export PATH=${OLDPATH}
 done

-cp -a ${SRCDIR}/src/webp/* ${TARGETDIR}/Headers/
+cp -a ${SRCDIR}/src/webp/*.h ${TARGETDIR}/Headers/
 ${LIPO} -create ${LIBLIST} -output ${TARGETDIR}/WebP
--- a/makefile.unix
+++ b/makefile.unix
@ -67,6 +67,8 @@ EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
 EXTRA_FLAGS += -Wshadow
+EXTRA_FLAGS += -Wformat-security -Wformat-nonliteral
+
 # EXTRA_FLAGS += -Wvla

 # AVX2-specific flags:
@ -108,6 +110,7 @@ DEMUX_OBJS = \

 DSP_DEC_OBJS = \
    src/dsp/alpha_processing.o \
+    src/dsp/alpha_processing_sse2.o \
    src/dsp/cpu.o \
    src/dsp/dec.o \
    src/dsp/dec_clip_tables.o \
--- a/man/cwebp.1
+++ b/man/cwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "July 22, 2014"
+.TH CWEBP 1 "October 19, 2015"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@ -38,6 +38,13 @@ Print the version number (as major.minor.revision) and exit.
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
+.br
+In case of lossy compression (default), a small factor produces a smaller file
+with lower quality. Best quality is achieved by using a value of 100.
+.br
+In case of lossless compression (specified by the \-lossless option), a small
+factor enables faster compression speed, but produces a larger file. Maximum
+compression is achieved by using a value of 100.
 .\" TODO(jzern): restore post-v0.4.1
 .\" .TP
 .\" .BI \-z " int
@ -47,13 +54,6 @@ is 75.
 .\" This option is actually a shortcut for some predefined settings for quality
 .\" and method. If options \-q  or \-m are subsequently used, they will invalidate
 .\" the effect of this \-z option.
-.\" .br
-.\" In case of lossy compression (default), a small factor produces a smaller file
-.\" with lower quality. Best quality is achieved by using a value of 100.
-.\" .br
-.\" In case of lossless compression (specified by the \-lossless option), a small
-.\" factor enables faster compression speed, but produces a larger file. Maximum
-.\" compression is achieved by using a value of 100.
 .TP
 .BI \-alpha_q " int
 Specify the compression factor for alpha compression between 0 and 100.
@ -259,7 +259,7 @@ Only print brief information (output file size and PSNR) for testing purpose.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/dwebp.1
+++ b/man/dwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "July 22, 2014"
+.TH DWEBP 1 "October 19, 2015"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@ -106,7 +106,7 @@ Disable all assembly optimizations.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "March 7, 2014"
+.TH GIF2WEBP 1 "October 19, 2015"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@ -111,7 +111,7 @@ Do not print anything.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/vwebp.1
+++ b/man/vwebp.1
@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH VWEBP 1 "July 23, 2014"
+.TH VWEBP 1 "October 19, 2015"
 .SH NAME
 vwebp \- decompress a WebP file and display it in a window
 .SH SYNOPSIS
@ -65,7 +65,7 @@ Quit.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
 http://www.webmproject.org/code/contribute/submitting-patches/
--- a/man/webpmux.1
+++ b/man/webpmux.1
@ -1,7 +1,8 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "December 17, 2013"
+.TH WEBPMUX 1 "October 19, 2015"
 .SH NAME
-webpmux \- command line tool to create WebP Mux/container file.
+webpmux \- create animated WebP files from non\-animated WebP images, extract
+frames from animated WebP images, and manage XMP/EXIF metadata and ICC profile.
 .SH SYNOPSIS
 .B webpmux \-get
 .I GET_OPTIONS
@ -45,8 +46,8 @@ This manual page documents the
 .B webpmux
 command.
 .PP
-\fBwebpmux\fP can be used to create a WebP container file
-and extract/strip relevant data from the container file.
+\fBwebpmux\fP can be used to create/extract from animated WebP files, as well as
+to add/extract/strip XMP/EXIF metadata and ICC profile.
 .SH OPTIONS
 .SS GET_OPTIONS (\-get):
 .TP
@ -60,7 +61,7 @@ Get EXIF metadata.
 Get XMP metadata.
 .TP
 .BI frame " n
-Get nth frame.
+Get nth frame from an animated image. (n = 0 has a special meaning: last frame).

 .SS SET_OPTIONS (\-set)
 .TP
@ -91,12 +92,13 @@ Strip EXIF metadata.
 Strip XMP metadata.

 .SS FRAME_OPTIONS (\-frame)
+Create an animated WebP file from multiple (non\-animated) WebP images.
 .TP
 .I file_i +di[+xi+yi[+mi[bi]]]
 Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
 offset for this frame, 'di' is the pause duration before next frame, 'mi' is
 the dispose method for this frame (0 for NONE or 1 for BACKGROUND) and 'bi' is
-the blending method for this frame (+b for BLEND or -b for NO_BLEND).
+the blending method for this frame (+b for BLEND or \-b for NO_BLEND).
 Argument 'bi' can be omitted and will default to +b (BLEND).
 Also, 'mi' can be omitted if 'bi' is omitted and will default to 0 (NONE).
 Finally, if 'mi' and 'bi' are omitted then 'xi' and 'yi' can be omitted and will
@ -127,40 +129,64 @@ The nature of EXIF, XMP and ICC data is not checked and is assumed to be valid.

 .SH BUGS
 Please report all bugs to our issue tracker:
-http://code.google.com/p/webp/issues
+https://bugs.chromium.org/p/webp
 .br
 Patches welcome! See this page to get started:
-http://www.webmproject.org/code/contribute/submitting-patches/
+http://www.webmproject.org/code/contribute/submitting\-patches/

 .SH EXAMPLES
+.P
+Add ICC profile:
+.br
 webpmux \-set icc image_profile.icc in.webp \-o icc_container.webp
+.P
+Extract ICC profile:
 .br
 webpmux \-get icc icc_container.webp \-o image_profile.icc
+.P
+Strip ICC profile:
 .br
 webpmux \-strip icc icc_container.webp \-o without_icc.webp
+.P
+Add XMP metadata:
 .br
 webpmux \-set xmp image_metadata.xmp in.webp \-o xmp_container.webp
+.P
+Extract XMP metadata:
 .br
 webpmux \-get xmp xmp_container.webp \-o image_metadata.xmp
+.P
+Strip XMP metadata:
 .br
 webpmux \-strip xmp xmp_container.webp \-o without_xmp.webp
+.P
+Add EXIF metadata:
 .br
 webpmux \-set exif image_metadata.exif in.webp \-o exif_container.webp
+.P
+Extract EXIF metadata:
 .br
 webpmux \-get exif exif_container.webp \-o image_metadata.exif
+.P
+Strip EXIF metadata:
 .br
 webpmux \-strip exif exif_container.webp \-o without_exif.webp
+.P
+Create an animated WebP file from 3 (non\-animated) WebP images:
 .br
-webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50
+webpmux \-frame 1.webp +100 \-frame 2.webp +100+50+50
 .br
 .RS 8
-\-frame anim_2.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
+\-frame 3.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
 .br
-.RS 8
 \-o anim_container.webp
 .RE
+.P
+Get the 2nd frame from an animated WebP file:
 .br
 webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
+.P
+Using \-get/\-set/\-strip with input file name starting with '\-':
 .br
 webpmux \-set icc image_profile.icc \-o icc_container.webp \-\- \-\-\-in.webp
 .br
--- a/src/Makefile.am
+++ b/src/Makefile.am
@ -35,7 +35,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 5:1:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 5:4:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@ -47,7 +47,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:1:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:4:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@ -33,6 +33,11 @@ static int IsValidColorspace(int webp_csp_mode) {
  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
 }

+// Minimum size needed by a plane: strictly speaking, the very last (or first,
+// if flipped) row doesn't require padding. Parenthesized for safe expansion.
+#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
+
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  int ok = 1;
  const WEBP_CSP_MODE mode = buffer->colorspace;
@ -42,20 +47,22 @@ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
    ok = 0;
  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int uv_width  = (width  + 1) / 2;
+    const int uv_height = (height + 1) / 2;
    const int y_stride = abs(buf->y_stride);
    const int u_stride = abs(buf->u_stride);
    const int v_stride = abs(buf->v_stride);
    const int a_stride = abs(buf->a_stride);
-    const uint64_t y_size = (uint64_t)y_stride * height;
-    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
-    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
-    const uint64_t a_size = (uint64_t)a_stride * height;
+    const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
+    const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
+    const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
+    const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
    ok &= (y_size <= buf->y_size);
    ok &= (u_size <= buf->u_size);
    ok &= (v_size <= buf->v_size);
    ok &= (y_stride >= width);
-    ok &= (u_stride >= (width + 1) / 2);
-    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (u_stride >= uv_width);
+    ok &= (v_stride >= uv_width);
    ok &= (buf->y != NULL);
    ok &= (buf->u != NULL);
    ok &= (buf->v != NULL);
@ -67,13 +74,16 @ static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  } else {    // RGB checks
    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
    const int stride = abs(buf->stride);
-    const uint64_t size = (uint64_t)stride * height;
+    // last row spans width * kModeBpp[mode] (as in the stride check), not width
+    const uint64_t size =
+        MIN_BUFFER_SIZE((uint64_t)width * kModeBpp[mode], height, stride);
    ok &= (size <= buf->size);
    ok &= (stride >= width * kModeBpp[mode]);
    ok &= (buf->rgba != NULL);
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
+#undef MIN_BUFFER_SIZE

 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@ -177,7 +177,7 @@ void VP8InitDithering(const WebPDecoderOptions* const options,
        dec->dither_ = 1;
      }
    }
-#if WEBP_DECODER_ABI_VERSION > 0x0203
+#if WEBP_DECODER_ABI_VERSION > 0x0204
    // potentially allow alpha dithering
    dec->alpha_dithering_ = options->alpha_dithering_strength;
    if (dec->alpha_dithering_ > 100) {
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@ -357,30 +357,33 @@ static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
 }

 // Partition #0
-static int CopyParts0Data(WebPIDecoder* const idec) {
+static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
  VP8BitReader* const br = &dec->br_;
-  const size_t psize = br->buf_end_ - br->buf_;
+  const size_t part_size = br->buf_end_ - br->buf_;
  MemBuffer* const mem = &idec->mem_;
  assert(!idec->is_lossless_);
  assert(mem->part0_buf_ == NULL);
-  assert(psize > 0);
-  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
+  // the following is a format limitation, no need for runtime check:
+  assert(part_size <= mem->part0_size_);
+  if (part_size == 0) {   // can't have zero-size partition #0
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
  if (mem->mode_ == MEM_MODE_APPEND) {
    // We copy and grab ownership of the partition #0 data.
-    uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, psize);
+    uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size);
    if (part0_buf == NULL) {
-      return 0;
+      return VP8_STATUS_OUT_OF_MEMORY;
    }
-    memcpy(part0_buf, br->buf_, psize);
+    memcpy(part0_buf, br->buf_, part_size);
    mem->part0_buf_ = part0_buf;
    br->buf_ = part0_buf;
-    br->buf_end_ = part0_buf + psize;
+    br->buf_end_ = part0_buf + part_size;
  } else {
    // Else: just keep pointers to the partition #0's data in dec_->br_.
  }
-  mem->start_ += psize;
-  return 1;
+  mem->start_ += part_size;
+  return VP8_STATUS_OK;
 }

 static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
@ -414,8 +417,10 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
                                       io->width, io->height);
  VP8InitDithering(params->options, dec);
-  if (!CopyParts0Data(idec)) {
-    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
+
+  dec->status_ = CopyParts0Data(idec);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
  }

  // Finish setting up the decoding parameters. Will call io->setup().
@ -529,6 +534,12 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
  }

  if (!VP8LDecodeImage(dec)) {
+    // The decoding is called after all the data-bytes are aggregated. Change
+    // the error to VP8_STATUS_BITSTREAM_ERROR in case the lossless decoder
+    // fails to decode all the pixels (VP8_STATUS_SUSPENDED).
+    if (dec->status_ == VP8_STATUS_SUSPENDED) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    }
    return ErrorStatusLossless(idec, dec->status_);
  }

--- a/src/dec/io.c
+++ b/src/dec/io.c
@ -322,37 +322,31 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
  size_t tmp_size;
-  int32_t* work;
+  rescaler_t* work;

  tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
  if (has_alpha) {
    tmp_size += work_size * sizeof(*work);
  }
-  p->memory = WebPSafeCalloc(1ULL, tmp_size);
+  p->memory = WebPSafeMalloc(1ULL, tmp_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   buf->y, out_width, out_height, buf->y_stride, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
-                   uv_in_width, uv_out_width,
-                   uv_in_height, uv_out_height,
                   work + work_size + uv_work_size);
  p->emit = EmitRescaledYUV;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     buf->a, out_width, out_height, buf->a_stride, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + work_size + 2 * uv_work_size);
    p->emit_alpha = EmitRescaledAlphaYUV;
    WebPInitAlphaProcessing();
@ -375,9 +369,9 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
         WebPRescalerHasPendingOutput(&p->scaler_u)) {
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
-    WebPRescalerExportRow(&p->scaler_y, 0);
-    WebPRescalerExportRow(&p->scaler_u, 0);
-    WebPRescalerExportRow(&p->scaler_v, 0);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
            dst, p->scaler_y.dst_width);
    dst += buf->stride;
@ -425,7 +419,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      const uint32_t alpha_value = p->scaler_a.dst[i];
      dst[4 * i] = alpha_value;
@ -458,7 +452,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
    int i;
    assert(p->last_y + y_pos + num_lines_out < p->output->height);
-    WebPRescalerExportRow(&p->scaler_a, 0);
+    WebPRescalerExportRow(&p->scaler_a);
    for (i = 0; i < width; ++i) {
      // Fill in the alpha value (converted to 4 bits).
      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
@ -495,7 +489,7 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
-  int32_t* work;  // rescalers work area
+  rescaler_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  size_t tmp_size1, tmp_size2, total_size;

@ -506,30 +500,26 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
    tmp_size2 += out_width;
  }
  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
-  p->memory = WebPSafeCalloc(1ULL, total_size);
+  p->memory = WebPSafeMalloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
-  work = (int32_t*)p->memory;
+  work = (rescaler_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   tmp + 0 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, out_width, io->mb_h, out_height,
                   work + 0 * work_size);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   tmp + 1 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 1 * work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   tmp + 2 * out_width, out_width, out_height, 0, 1,
-                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;

  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     tmp + 3 * out_width, out_width, out_height, 0, 1,
-                     io->mb_w, out_width, io->mb_h, out_height,
                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
    if (p->output->colorspace == MODE_RGBA_4444 ||
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
+#include "./vp8i.h"
 #include "../utils/bit_reader_inl.h"

 #define USE_GENERIC_TREE
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@ -562,6 +562,7 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
    }
    block->non_zero_y_ = 0;
    block->non_zero_uv_ = 0;
+    block->dither_ = 0;
  }

  if (dec->filter_type_ > 0) {  // store filter info
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@ -31,7 +31,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 1
+#define DEC_REV_VERSION 4

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@ -234,6 +234,7 @@ static int ReadHuffmanCodeLengths(

 End:
  VP8LHuffmanTreeFree(&tree);
+  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
  return ok;
 }

@ -389,13 +390,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
  const int in_height = io->mb_h;
  const int out_height = io->scaled_height;
  const uint64_t work_size = 2 * num_channels * (uint64_t)out_width;
-  int32_t* work;        // Rescaler work area.
-  const uint64_t scaled_data_size = num_channels * (uint64_t)out_width;
+  rescaler_t* work;        // Rescaler work area.
+  const uint64_t scaled_data_size = (uint64_t)out_width;
  uint32_t* scaled_data;  // Temporary storage for scaled BGRA data.
  const uint64_t memory_size = sizeof(*dec->rescaler) +
                               work_size * sizeof(*work) +
                               scaled_data_size * sizeof(*scaled_data);
-  uint8_t* memory = (uint8_t*)WebPSafeCalloc(memory_size, sizeof(*memory));
+  uint8_t* memory = (uint8_t*)WebPSafeMalloc(memory_size, sizeof(*memory));
  if (memory == NULL) {
    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
    return 0;
@ -405,13 +406,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {

  dec->rescaler = (WebPRescaler*)memory;
  memory += sizeof(*dec->rescaler);
-  work = (int32_t*)memory;
+  work = (rescaler_t*)memory;
  memory += work_size * sizeof(*work);
  scaled_data = (uint32_t*)memory;

  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
-                   out_width, out_height, 0, num_channels,
-                   in_width, out_width, in_height, out_height, work);
+                   out_width, out_height, 0, num_channels, work);
  return 1;
 }

@ -426,7 +426,7 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    WebPMultARGBRow(src, dst_width, 1);
    VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
    ++num_lines_out;
@ -544,7 +544,7 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
  const int dst_width = rescaler->dst_width;
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    WebPMultARGBRow(src, dst_width, 1);
    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
    ++y_pos;
@ -801,6 +801,7 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
      ok = 0;
      goto End;
    }
+    assert(br->eos_ == VP8LIsEndOfStream(br));
    ok = !br->error_;
    if (!ok) goto End;
  }
@ -898,7 +899,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
          process_func(dec, row);
        }
      }
-      if (src < src_last) {
+      if (src < src_end) {
        if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
        if (color_cache != NULL) {
          while (last_cached < src) {
@ -918,6 +919,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
      ok = 0;
      goto End;
    }
+    assert(br->eos_ == VP8LIsEndOfStream(br));
    ok = !br->error_;
    if (!ok) goto End;
  }
@ -1354,6 +1356,10 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
  // Sanity checks.
  if (dec == NULL) return 0;

+  dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+  assert(dec->hdr_.htree_groups_ != NULL);
+  assert(dec->hdr_.num_htree_groups_ > 0);
+
  io = dec->io_;
  assert(io != NULL);
  params = (WebPDecParams*)io->opaque;
--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@ -9,6 +9,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:1:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:2:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@ -25,7 +25,7 @@

 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 1
+#define DMUX_REV_VERSION 2

 typedef struct {
  size_t start_;        // start location of the data
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@ -38,6 +38,7 @@ libwebpdsp_avx2_la_CPPFLAGS = $(libwebpdsp_la_CPPFLAGS)
 libwebpdsp_avx2_la_CFLAGS = $(AM_CFLAGS) $(AVX2_FLAGS)

 libwebpdspdecode_sse2_la_SOURCES =
+libwebpdspdecode_sse2_la_SOURCES += alpha_processing_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += dec_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += lossless_sse2.c
 libwebpdspdecode_sse2_la_SOURCES += upsampling_sse2.c
--- a/src/dsp/alpha_processing.c
+++ b/src/dsp/alpha_processing.c
@ -284,15 +284,52 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
 #endif
 }

+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  uint8_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint8_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  return (alpha_mask == 0xff);
+}
+
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);

 //------------------------------------------------------------------------------
 // Init function

+extern void WebPInitAlphaProcessingSSE2(void);
+
+static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
+    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
+
 void WebPInitAlphaProcessing(void) {
+  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  WebPMultARGBRow = MultARGBRow;
  WebPMultRow = MultRow;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPExtractAlpha = ExtractAlpha;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitAlphaProcessingSSE2();
+    }
+#endif
+  }
+  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/dsp/alpha_processing_sse2.c
+++ b/src/dsp/alpha_processing_sse2.c
@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  // alpha_and stores an 'and' operation of all the alpha[] values. The final
+  // value is not 0xff if any of the alpha[] is not equal to 0xff.
+  uint32_t alpha_and = 0xff;
+  int i, j;
+  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
+  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+  __m128i all_alphas = all_0xff;
+
+  // The vector loop loads whole 4-byte quadruplets, so it must stop at least
+  // one pixel early: past 'argb[4 * width - 4]' the 3 extra bytes may not be
+  // accessible, as alpha can be the first or the last byte of the quadruplet.
+  const int limit = (width - 1) & ~7;
+
+  for (j = 0; j < height; ++j) {
+    const __m128i* src = (const __m128i*)argb;
+    for (i = 0; i < limit; i += 8) {
+      // load 32 argb bytes
+      const __m128i a0 = _mm_loadu_si128(src + 0);
+      const __m128i a1 = _mm_loadu_si128(src + 1);
+      const __m128i b0 = _mm_and_si128(a0, a_mask);
+      const __m128i b1 = _mm_and_si128(a1, a_mask);
+      const __m128i c0 = _mm_packs_epi32(b0, b1);
+      const __m128i d0 = _mm_packus_epi16(c0, c0);
+      // store
+      _mm_storel_epi64((__m128i*)&alpha[i], d0);
+      // accumulate eight alpha 'and' in parallel
+      all_alphas = _mm_and_si128(all_alphas, d0);
+      src += 2;
+    }
+    for (; i < width; ++i) {
+      const uint32_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_and &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  // Combine the eight alpha 'and' into an 8-bit mask.
+  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+  return (alpha_and == 0xff);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingSSE2(void);
+
+void WebPInitAlphaProcessingSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  WebPExtractAlpha = ExtractAlpha;
+#endif
+}
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@ -29,16 +29,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type));
+    : "a"(info_type), "c"(0));
 }
-#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
 #elif defined(WEBP_MSC_SSE2)
 #define GetCPUInfo __cpuid
@ -55,9 +57,11 @@ static WEBP_INLINE uint64_t xgetbv(void) {
    : "=a"(eax), "=d"(edx) : "c" (ecx));
  return ((uint64_t)edx << 32) | eax;
 }
-#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
 #define xgetbv() _xgetbv(0)
-#elif defined(_M_IX86)
+#elif defined(_MSC_VER) && defined(_M_IX86)
 static WEBP_INLINE uint64_t xgetbv(void) {
  uint32_t eax_, edx_;
  __asm {
@ -118,7 +122,7 @@ static int armCPUInfo(CPUFeature feature) {
  return 1;
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(__mips__)
+#elif defined(WEBP_USE_MIPS32)
 static int mipsCPUInfo(CPUFeature feature) {
  (void)feature;
  return 1;
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@ -688,7 +688,12 @@ extern void VP8DspInitSSE2(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);

+static volatile VP8CPUInfo dec_last_cpuinfo_used =
+    (VP8CPUInfo)&dec_last_cpuinfo_used;
+
 void VP8DspInit(void) {
+  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  VP8InitClipTables();

  VP8TransformWHT = TransformWHT;
@ -727,5 +732,5 @@ void VP8DspInit(void) {
    }
 #endif
  }
+  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
-
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@ -24,24 +24,24 @@

 // Load/Store vertical edge
 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
-  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
-  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

 #define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
-  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

 #if !defined(WORK_AROUND_GCC)

--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@ -32,9 +32,14 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) \
    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
 #else
+# define LOCAL_GCC_VERSION 0
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif

+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@ -56,12 +61,19 @@ extern "C" {
 #define WEBP_ANDROID_NEON  // Android targets that might support NEON
 #endif

-#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || defined(__aarch64__)
+// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
+// inline assembly would need to be modified for use with Native Client.
+#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+     defined(__aarch64__)) && !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif

-#if defined(__mips__)
+#if defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
 #define WEBP_USE_MIPS32
+#if (__mips_isa_rev >= 2)
+#define WEBP_USE_MIPS32_R2
+#endif
 #endif

 typedef enum {
@ -244,6 +256,13 @@ extern void (*WebPApplyAlphaMultiply)(
 extern void (*WebPApplyAlphaMultiply4444)(
    uint8_t* rgba4444, int w, int h, int stride);

+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+                               int width, int height,
+                               uint8_t* alpha, int alpha_stride);
+
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@ -692,7 +692,12 @@ extern void VP8EncDspInitAVX2(void);
 extern void VP8EncDspInitNEON(void);
 extern void VP8EncDspInitMIPS32(void);

+static volatile VP8CPUInfo enc_last_cpuinfo_used =
+    (VP8CPUInfo)&enc_last_cpuinfo_used;
+
 void VP8EncDspInit(void) {
+  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  VP8DspInit();  // common inverse transforms
  InitTables();

@ -737,5 +742,6 @@ void VP8EncDspInit(void) {
    }
 #endif
  }
+  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }

--- a/src/dsp/enc_mips32.c
+++ b/src/dsp/enc_mips32.c
@ -20,6 +20,10 @@
 #include "../enc/vp8enci.h"
 #include "../enc/cost.h"

+#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
+#define WORK_AROUND_GCC
+#endif
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;

@ -30,26 +34,26 @@ static const int kC2 = 35468;
 // TEMP0..TEMP3 - registers for corresponding tmp elements
 // TEMP4..TEMP5 - temporary registers
 #define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
-  "lh      %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lh      %[temp18],      "#B"(%[temp20])                 \n\t"            \
-  "lh      %[temp17],      "#C"(%[temp20])                 \n\t"            \
-  "lh      %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP4"],    %[temp16],      %[temp18]       \n\t"            \
-  "subu    %[temp16],      %[temp16],      %[temp18]       \n\t"            \
-  "mul     %["#TEMP0"],    %[temp17],      %[kC2]          \n\t"            \
-  "mul     %[temp18],      %[temp19],      %[kC1]          \n\t"            \
-  "mul     %[temp17],      %[temp17],      %[kC1]          \n\t"            \
-  "mul     %[temp19],      %[temp19],      %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\n"            \
-  "sra     %[temp18],      %[temp18],      16              \n\n"            \
-  "sra     %[temp17],      %[temp17],      16              \n\n"            \
-  "sra     %[temp19],      %[temp19],      16              \n\n"            \
-  "subu    %["#TEMP2"],    %["#TEMP0"],    %[temp18]       \n\t"            \
-  "addu    %["#TEMP3"],    %[temp17],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP0"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"            \
-  "addu    %["#TEMP1"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP2"],    %[temp16],      %["#TEMP2"]     \n\t"            \
-  "subu    %["#TEMP3"],    %["#TEMP4"],    %["#TEMP3"]     \n\t"
+  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
+  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
+  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
+  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
+  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
+  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
+  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
+  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
+  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
+  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"          \
+  "sra     %[temp18],      %[temp18],      16                \n\t"          \
+  "sra     %[temp17],      %[temp17],      16                \n\t"          \
+  "sra     %[temp19],      %[temp19],      16                \n\t"          \
+  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
+  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
+  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
+  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
+  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"

 // macro for one horizontal pass in ITransformOne
 // MUL and STORE macros inlined
@ -57,59 +61,59 @@ static const int kC2 = 35468;
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from ref and store to dst buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)            \
-  "addiu   %["#TEMP0"],    %["#TEMP0"],    4               \n\t"            \
-  "addu    %[temp16],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "subu    %[temp17],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "mul     %["#TEMP0"],    %["#TEMP4"],    %[kC2]          \n\t"            \
-  "mul     %["#TEMP8"],    %["#TEMP12"],   %[kC1]          \n\t"            \
-  "mul     %["#TEMP4"],    %["#TEMP4"],    %[kC1]          \n\t"            \
-  "mul     %["#TEMP12"],   %["#TEMP12"],   %[kC2]          \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    16              \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    16              \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    16              \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   16              \n\t"            \
-  "subu    %[temp18],      %["#TEMP0"],    %["#TEMP8"]     \n\t"            \
-  "addu    %[temp19],      %["#TEMP4"],    %["#TEMP12"]    \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %[temp19]       \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP8"],    %[temp17],      %[temp18]       \n\t"            \
-  "subu    %["#TEMP12"],   %[temp16],      %[temp19]       \n\t"            \
-  "lw      %[temp20],      0(%[args])                      \n\t"            \
-  "sra     %["#TEMP0"],    %["#TEMP0"],    3               \n\t"            \
-  "sra     %["#TEMP4"],    %["#TEMP4"],    3               \n\t"            \
-  "sra     %["#TEMP8"],    %["#TEMP8"],    3               \n\t"            \
-  "sra     %["#TEMP12"],   %["#TEMP12"],   3               \n\t"            \
-  "lbu     %[temp16],      "#A"(%[temp20])                 \n\t"            \
-  "lbu     %[temp17],      "#B"(%[temp20])                 \n\t"            \
-  "lbu     %[temp18],      "#C"(%[temp20])                 \n\t"            \
-  "lbu     %[temp19],      "#D"(%[temp20])                 \n\t"            \
-  "addu    %["#TEMP0"],    %[temp16],      %["#TEMP0"]     \n\t"            \
-  "addu    %["#TEMP4"],    %[temp17],      %["#TEMP4"]     \n\t"            \
-  "addu    %["#TEMP8"],    %[temp18],      %["#TEMP8"]     \n\t"            \
-  "addu    %["#TEMP12"],   %[temp19],      %["#TEMP12"]    \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    $zero           \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    $zero           \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    $zero           \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   $zero           \n\t"            \
-  "movn    %["#TEMP0"],    $zero,          %[temp16]       \n\t"            \
-  "movn    %["#TEMP4"],    $zero,          %[temp17]       \n\t"            \
-  "movn    %["#TEMP8"],    $zero,          %[temp18]       \n\t"            \
-  "movn    %["#TEMP12"],   $zero,          %[temp19]       \n\t"            \
-  "addiu   %[temp20],      $zero,          255             \n\t"            \
-  "slt     %[temp16],      %["#TEMP0"],    %[temp20]       \n\t"            \
-  "slt     %[temp17],      %["#TEMP4"],    %[temp20]       \n\t"            \
-  "slt     %[temp18],      %["#TEMP8"],    %[temp20]       \n\t"            \
-  "slt     %[temp19],      %["#TEMP12"],   %[temp20]       \n\t"            \
-  "movz    %["#TEMP0"],    %[temp20],      %[temp16]       \n\t"            \
-  "movz    %["#TEMP4"],    %[temp20],      %[temp17]       \n\t"            \
-  "lw      %[temp16],      8(%[args])                      \n\t"            \
-  "movz    %["#TEMP8"],    %[temp20],      %[temp18]       \n\t"            \
-  "movz    %["#TEMP12"],   %[temp20],      %[temp19]       \n\t"            \
-  "sb      %["#TEMP0"],    "#A"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP4"],    "#B"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP8"],    "#C"(%[temp16])                 \n\t"            \
-  "sb      %["#TEMP12"],   "#D"(%[temp16])                 \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)              \
+  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4             \n\t"            \
+  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]        \n\t"            \
+  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]        \n\t"            \
+  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]        \n\t"            \
+  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16            \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16            \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16            \n\t"            \
+  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]   \n\t"            \
+  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]  \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]       \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]       \n\t"            \
+  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]       \n\t"            \
+  "lw      %[temp20],      0(%[args])                        \n\t"            \
+  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3             \n\t"            \
+  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3             \n\t"            \
+  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3             \n\t"            \
+  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3             \n\t"            \
+  "lbu     %[temp16],      " #A "(%[temp20])                 \n\t"            \
+  "lbu     %[temp17],      " #B "(%[temp20])                 \n\t"            \
+  "lbu     %[temp18],      " #C "(%[temp20])                 \n\t"            \
+  "lbu     %[temp19],      " #D "(%[temp20])                 \n\t"            \
+  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]   \n\t"            \
+  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]   \n\t"            \
+  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]   \n\t"            \
+  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]  \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    $zero           \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    $zero           \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    $zero           \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   $zero           \n\t"            \
+  "movn    %[" #TEMP0 "],    $zero,          %[temp16]       \n\t"            \
+  "movn    %[" #TEMP4 "],    $zero,          %[temp17]       \n\t"            \
+  "movn    %[" #TEMP8 "],    $zero,          %[temp18]       \n\t"            \
+  "movn    %[" #TEMP12 "],   $zero,          %[temp19]       \n\t"            \
+  "addiu   %[temp20],      $zero,          255               \n\t"            \
+  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]       \n\t"            \
+  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]       \n\t"            \
+  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]       \n\t"            \
+  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]       \n\t"            \
+  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]       \n\t"            \
+  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]       \n\t"            \
+  "lw      %[temp16],      8(%[args])                        \n\t"            \
+  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]       \n\t"            \
+  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]       \n\t"            \
+  "sb      %[" #TEMP0 "],    " #A "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP4 "],    " #B "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP8 "],    " #C "(%[temp16])               \n\t"            \
+  "sb      %[" #TEMP12 "],   " #D "(%[temp16])               \n\t"

 // Does one or two inverse transforms.
 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@ -160,9 +164,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
 // K - offset in bytes (kZigzag[n] * 4)
 // N - offset in bytes (n * 2)
 #define QUANTIZE_ONE(J, K, N)                                               \
-  "lh           %[temp0],       "#J"(%[ppin])                       \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppsharpen])                  \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppzthresh])                  \n\t"   \
+  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
@ -171,9 +175,9 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
-  "lhu          %[temp1],       "#J"(%[ppiq])                       \n\t"   \
-  "lw           %[temp2],       "#K"(%[ppbias])                     \n\t"   \
-  "lhu          %[temp3],       "#J"(%[ppq])                        \n\t"   \
+  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
+  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
+  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
@ -183,8 +187,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
 "2:                                                                 \n\t"   \
-  "sh           %[temp5],       "#J"(%[ppin])                       \n\t"   \
-  "sh           %[level],       "#N"(%[pout])                       \n\t"
+  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
+  "sh           %[level],       " #N "(%[pout])                     \n\t"

 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
@ -245,14 +249,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // E..H - offsets in bytes to store first results to tmp buffer
 // E1..H1 - offsets in bytes to store second results to tmp buffer
 #define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1)   \
-  "lbu    %[temp0],  "#A"(%[a])              \n\t"                \
-  "lbu    %[temp1],  "#B"(%[a])              \n\t"                \
-  "lbu    %[temp2],  "#C"(%[a])              \n\t"                \
-  "lbu    %[temp3],  "#D"(%[a])              \n\t"                \
-  "lbu    %[temp4],  "#A"(%[b])              \n\t"                \
-  "lbu    %[temp5],  "#B"(%[b])              \n\t"                \
-  "lbu    %[temp6],  "#C"(%[b])              \n\t"                \
-  "lbu    %[temp7],  "#D"(%[b])              \n\t"                \
+  "lbu    %[temp0],  " #A "(%[a])            \n\t"                \
+  "lbu    %[temp1],  " #B "(%[a])            \n\t"                \
+  "lbu    %[temp2],  " #C "(%[a])            \n\t"                \
+  "lbu    %[temp3],  " #D "(%[a])            \n\t"                \
+  "lbu    %[temp4],  " #A "(%[b])            \n\t"                \
+  "lbu    %[temp5],  " #B "(%[b])            \n\t"                \
+  "lbu    %[temp6],  " #C "(%[b])            \n\t"                \
+  "lbu    %[temp7],  " #D "(%[b])            \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "addu   %[temp2],  %[temp1],    %[temp3]   \n\t"                \
@ -269,14 +273,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp3],  %[temp3],    %[temp6]   \n\t"                \
  "addu   %[temp6],  %[temp4],    %[temp5]   \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp5]   \n\t"                \
-  "sw     %[temp7],  "#E"(%[tmp])            \n\t"                \
-  "sw     %[temp2],  "#H"(%[tmp])            \n\t"                \
-  "sw     %[temp8],  "#F"(%[tmp])            \n\t"                \
-  "sw     %[temp0],  "#G"(%[tmp])            \n\t"                \
-  "sw     %[temp1],  "#E1"(%[tmp])           \n\t"                \
-  "sw     %[temp3],  "#H1"(%[tmp])           \n\t"                \
-  "sw     %[temp6],  "#F1"(%[tmp])           \n\t"                \
-  "sw     %[temp4],  "#G1"(%[tmp])           \n\t"
+  "sw     %[temp7],  " #E "(%[tmp])          \n\t"                \
+  "sw     %[temp2],  " #H "(%[tmp])          \n\t"                \
+  "sw     %[temp8],  " #F "(%[tmp])          \n\t"                \
+  "sw     %[temp0],  " #G "(%[tmp])          \n\t"                \
+  "sw     %[temp1],  " #E1 "(%[tmp])         \n\t"                \
+  "sw     %[temp3],  " #H1 "(%[tmp])         \n\t"                \
+  "sw     %[temp6],  " #F1 "(%[tmp])         \n\t"                \
+  "sw     %[temp4],  " #G1 "(%[tmp])         \n\t"

 // macro for one vertical pass in Disto4x4 (TTransform)
 // two calls of function TTransform are merged into single one
@ -291,10 +295,10 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
 // A1..D1 - offsets in bytes to load second results from tmp buffer
 // E..H - offsets in bytes to load from w buffer
 #define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
-  "lw     %[temp0],  "#A1"(%[tmp])           \n\t"                \
-  "lw     %[temp1],  "#C1"(%[tmp])           \n\t"                \
-  "lw     %[temp2],  "#B1"(%[tmp])           \n\t"                \
-  "lw     %[temp3],  "#D1"(%[tmp])           \n\t"                \
+  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
+  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
+  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
+  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -315,18 +319,18 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
-  "lhu    %[temp4],  "#E"(%[w])              \n\t"                \
-  "lhu    %[temp5],  "#F"(%[w])              \n\t"                \
-  "lhu    %[temp6],  "#G"(%[w])              \n\t"                \
-  "lhu    %[temp7],  "#H"(%[w])              \n\t"                \
+  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
+  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
+  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
+  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
-  "lw     %[temp0],  "#A"(%[tmp])            \n\t"                \
-  "lw     %[temp1],  "#C"(%[tmp])            \n\t"                \
-  "lw     %[temp2],  "#B"(%[tmp])            \n\t"                \
-  "lw     %[temp3],  "#D"(%[tmp])            \n\t"                \
+  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
+  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
+  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
+  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
@ -403,71 +407,71 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to load from src and ref buffers
 // TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
-  "lw     %["#TEMP1"],  0(%[args])                     \n\t"    \
-  "lw     %["#TEMP2"],  4(%[args])                     \n\t"    \
-  "lbu    %[temp16],    "#A"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#A"(%["#TEMP2"])              \n\t"    \
-  "lbu    %[temp18],    "#B"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#B"(%["#TEMP2"])              \n\t"    \
-  "subu   %[temp20],    %[temp16],    %[temp17]        \n\t"    \
-  "lbu    %[temp16],    "#C"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp17],    "#C"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP0"],  %[temp18],    %[temp19]        \n\t"    \
-  "lbu    %[temp18],    "#D"(%["#TEMP1"])              \n\t"    \
-  "lbu    %[temp19],    "#D"(%["#TEMP2"])              \n\t"    \
-  "subu   %["#TEMP1"],  %[temp16],    %[temp17]        \n\t"    \
-  "subu   %["#TEMP2"],  %[temp18],    %[temp19]        \n\t"    \
-  "addu   %["#TEMP3"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "subu   %["#TEMP2"],  %[temp20],    %["#TEMP2"]      \n\t"    \
-  "addu   %[temp20],    %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "subu   %["#TEMP0"],  %["#TEMP0"],  %["#TEMP1"]      \n\t"    \
-  "mul    %[temp16],    %["#TEMP2"],  %[c5352]         \n\t"    \
-  "mul    %[temp17],    %["#TEMP2"],  %[c2217]         \n\t"    \
-  "mul    %[temp18],    %["#TEMP0"],  %[c5352]         \n\t"    \
-  "mul    %[temp19],    %["#TEMP0"],  %[c2217]         \n\t"    \
-  "addu   %["#TEMP1"],  %["#TEMP3"],  %[temp20]        \n\t"    \
-  "subu   %[temp20],    %["#TEMP3"],  %[temp20]        \n\t"    \
-  "sll    %["#TEMP0"],  %["#TEMP1"],  3                \n\t"    \
-  "sll    %["#TEMP2"],  %[temp20],    3                \n\t"    \
-  "addiu  %[temp16],    %[temp16],    1812             \n\t"    \
-  "addiu  %[temp17],    %[temp17],    937              \n\t"    \
-  "addu   %[temp16],    %[temp16],    %[temp19]        \n\t"    \
-  "subu   %[temp17],    %[temp17],    %[temp18]        \n\t"    \
-  "sra    %["#TEMP1"],  %[temp16],    9                \n\t"    \
-  "sra    %["#TEMP3"],  %[temp17],    9                \n\t"
+#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3)   \
+  "lw     %[" #TEMP1 "],  0(%[args])                     \n\t"    \
+  "lw     %[" #TEMP2 "],  4(%[args])                     \n\t"    \
+  "lbu    %[temp16],    " #A "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #A "(%[" #TEMP2 "])            \n\t"    \
+  "lbu    %[temp18],    " #B "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #B "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[temp20],    %[temp16],    %[temp17]          \n\t"    \
+  "lbu    %[temp16],    " #C "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp17],    " #C "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]        \n\t"    \
+  "lbu    %[temp18],    " #D "(%[" #TEMP1 "])            \n\t"    \
+  "lbu    %[temp19],    " #D "(%[" #TEMP2 "])            \n\t"    \
+  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]        \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]        \n\t"    \
+  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]    \n\t"    \
+  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]    \n\t"    \
+  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]  \n\t"    \
+  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]         \n\t"    \
+  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]         \n\t"    \
+  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]         \n\t"    \
+  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]         \n\t"    \
+  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]      \n\t"    \
+  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]        \n\t"    \
+  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3              \n\t"    \
+  "sll    %[" #TEMP2 "],  %[temp20],    3                \n\t"    \
+  "addiu  %[temp16],    %[temp16],    1812               \n\t"    \
+  "addiu  %[temp17],    %[temp17],    937                \n\t"    \
+  "addu   %[temp16],    %[temp16],    %[temp19]          \n\t"    \
+  "subu   %[temp17],    %[temp17],    %[temp18]          \n\t"    \
+  "sra    %[" #TEMP1 "],  %[temp16],    9                \n\t"    \
+  "sra    %[" #TEMP3 "],  %[temp17],    9                \n\t"

 // macro for one vertical pass in FTransform
 // temp0..temp15 holds tmp[0]..tmp[15]
 // A..D - offsets in bytes to store to out buffer
 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
-#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)  \
-  "addu   %[temp16],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "subu   %[temp19],    %["#TEMP0"],  %["#TEMP12"]     \n\t"    \
-  "addu   %[temp17],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "subu   %[temp18],    %["#TEMP4"],  %["#TEMP8"]      \n\t"    \
-  "mul    %["#TEMP8"],  %[temp19],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP12"], %[temp18],    %[c2217]         \n\t"    \
-  "mul    %["#TEMP4"],  %[temp19],    %[c5352]         \n\t"    \
-  "mul    %[temp18],    %[temp18],    %[c5352]         \n\t"    \
-  "addiu  %[temp16],    %[temp16],    7                \n\t"    \
-  "addu   %["#TEMP0"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP0"],  %["#TEMP0"],  4                \n\t"    \
-  "addu   %["#TEMP12"], %["#TEMP12"], %["#TEMP4"]      \n\t"    \
-  "subu   %["#TEMP4"],  %[temp16],    %[temp17]        \n\t"    \
-  "sra    %["#TEMP4"],  %["#TEMP4"],  4                \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  30000            \n\t"    \
-  "addiu  %["#TEMP12"], %["#TEMP12"], 12000            \n\t"    \
-  "addiu  %["#TEMP8"],  %["#TEMP8"],  21000            \n\t"    \
-  "subu   %["#TEMP8"],  %["#TEMP8"],  %[temp18]        \n\t"    \
-  "sra    %["#TEMP12"], %["#TEMP12"], 16               \n\t"    \
-  "sra    %["#TEMP8"],  %["#TEMP8"],  16               \n\t"    \
-  "addiu  %[temp16],    %["#TEMP12"], 1                \n\t"    \
-  "movn   %["#TEMP12"], %[temp16],    %[temp19]        \n\t"    \
-  "sh     %["#TEMP0"],  "#A"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP4"],  "#C"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP8"],  "#D"(%[temp20])                \n\t"    \
-  "sh     %["#TEMP12"], "#B"(%[temp20])                \n\t"
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
+  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
+  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
+  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
+  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
+  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
+  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
+  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
+  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
+  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
+  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
+  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
+  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
+  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
+  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
+  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
+  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
+  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
+  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
+  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"

 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@ -618,14 +622,14 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
 }

 #define GET_SSE_INNER(A, B, C, D)                               \
-  "lbu     %[temp0],    "#A"(%[a])                   \n\t"      \
-  "lbu     %[temp1],    "#A"(%[b])                   \n\t"      \
-  "lbu     %[temp2],    "#B"(%[a])                   \n\t"      \
-  "lbu     %[temp3],    "#B"(%[b])                   \n\t"      \
-  "lbu     %[temp4],    "#C"(%[a])                   \n\t"      \
-  "lbu     %[temp5],    "#C"(%[b])                   \n\t"      \
-  "lbu     %[temp6],    "#D"(%[a])                   \n\t"      \
-  "lbu     %[temp7],    "#D"(%[b])                   \n\t"      \
+  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
+  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
+  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
+  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
+  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
+  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
+  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
+  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
@ -641,6 +645,7 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
  GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
  GET_SSE_INNER(D, D + 1, D + 2, D + 3)

+#if !defined(WORK_AROUND_GCC)
 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  int count;
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@ -742,6 +747,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return count;
 }

+#endif  // !defined(WORK_AROUND_GCC)
+
 #undef GET_SSE_MIPS32
 #undef GET_SSE_MIPS32_INNER

@ -759,9 +766,11 @@ void VP8EncDspInitMIPS32(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8FTransform = FTransform;
+#if !defined(WORK_AROUND_GCC)  // the SSE* functions are compiled out otherwise
  VP8SSE16x16 = SSE16x16;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
+#endif  // !defined(WORK_AROUND_GCC)
 #endif  // WEBP_USE_MIPS32
 }
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@ -253,7 +253,7 @@ static void ITransform(const uint8_t* ref,

 // Load all 4x4 pixels into a single uint8x16_t variable.
 static uint8x16_t Load4x4(const uint8_t* src) {
-  uint32x4_t out = { 0, 0, 0, 0 };
+  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
@ -929,7 +929,7 @@ static int SumToInt(uint32x4_t sum) {
 }

 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@ -938,7 +938,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
 }

 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
@ -947,7 +947,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
 }

 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
-  uint32x4_t sum = { 0, 0, 0, 0 };
+  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
@ -970,9 +970,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {

 //------------------------------------------------------------------------------

-// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable
-// in iOS/arm64 builds. Disable this function in those cases.
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+// Compilation with gcc-4.6.x is problematic for now.
+#if !defined(WORK_AROUND_GCC)

 static int16x8_t Quantize(int16_t* const in,
                          const VP8Matrix* const mtx, int offset) {
@ -1002,27 +1001,46 @@ static int16x8_t Quantize(int16_t* const in,
 }

 static const uint8_t kShuffles[4][8] = {
- { 0,   1,  2,  3,  8,  9, 16, 17 },
- { 10, 11,  4,  5,  6,  7, 12, 13 },
- { 18, 19, 24, 25, 26, 27, 20, 21 },
- { 14, 15, 22, 23, 28, 29, 30, 31 }
+  { 0,   1,  2,  3,  8,  9, 16, 17 },
+  { 10, 11,  4,  5,  6,  7, 12, 13 },
+  { 18, 19, 24, 25, 26, 27, 20, 21 },
+  { 14, 15, 22, 23, 28, 29, 30, 31 }
 };

 static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);
  const int16x8_t out1 = Quantize(in, mtx, 8);
+  uint8x8x4_t shuffles;
+  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+  // non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+  uint8x16x2_t all_out;
+  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
+  INIT_VECTOR4(shuffles,
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
+#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
+  INIT_VECTOR4(shuffles,
+               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
+               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+#endif
  // Zigzag reordering
-  vst1_u8((uint8_t*)(out +  0), vtbl4_u8(all_out, vld1_u8(kShuffles[0])));
-  vst1_u8((uint8_t*)(out +  4), vtbl4_u8(all_out, vld1_u8(kShuffles[1])));
-  vst1_u8((uint8_t*)(out +  8), vtbl4_u8(all_out, vld1_u8(kShuffles[2])));
-  vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
+  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
+  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
+  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
+  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
@ -1031,7 +1049,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return 0;
 }

-#endif   // !WORK_AROUND_GCC && !__aarch64__
+#endif   // !WORK_AROUND_GCC

 #endif   // WEBP_USE_NEON

@ -1054,7 +1072,7 @@ void VP8EncDspInitNEON(void) {
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
-#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__))
+#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock;
 #endif
 #endif   // WEBP_USE_NEON
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@ -450,12 +450,21 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }

-static WEBP_INLINE int Sub3(int a, int b, int c) {
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
  const int pb = b - c;
  const int pa = a - c;
  return abs(pb) - abs(pa);
 }

+#undef LOCAL_INLINE
+
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  const int pa_minus_pb =
      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
@ -1169,7 +1178,7 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
      data += remaining_width;
    }
    ++y;
-    if ((y & mask) == 0) pred_row += tiles_per_row;;
+    if ((y & mask) == 0) pred_row += tiles_per_row;
  }
 }

@ -1581,7 +1590,12 @@ extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPS32(void);

+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+    (VP8CPUInfo)&lossless_last_cpuinfo_used;
+
 void VP8LDspInit(void) {
+  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));

  VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
@ -1625,6 +1639,7 @@ void VP8LDspInit(void) {
    }
 #endif
  }
+  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }

 //------------------------------------------------------------------------------
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@ -56,24 +56,20 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;

 // Expose some C-only fallback functions
-extern void VP8LTransformColor_C(const VP8LMultipliers* const m,
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                          uint32_t* data, int num_pixels);
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                 uint32_t* data, int num_pixels);
-extern void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
-                                        uint32_t* data, int num_pixels);

-extern void VP8LConvertBGRAToRGB_C(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
-                                    int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
-                                        int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
-                                      int num_pixels, uint8_t* dst);
-extern void VP8LConvertBGRAToBGR_C(const uint32_t* src,
-                                   int num_pixels, uint8_t* dst);
-extern void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data,
-                                              int num_pixels);
-extern void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);

 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
--- a/src/dsp/lossless_mips32.c
+++ b/src/dsp/lossless_mips32.c
@ -285,28 +285,28 @@ static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
 // literal_ and successive histograms could be unaligned
 // so we must use ulw and usw
 #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)           \
-    "ulw    %[temp0], "#A"(%["#P0"])        \n\t"       \
-    "ulw    %[temp1], "#B"(%["#P0"])        \n\t"       \
-    "ulw    %[temp2], "#C"(%["#P0"])        \n\t"       \
-    "ulw    %[temp3], "#D"(%["#P0"])        \n\t"       \
-    "ulw    %[temp4], "#A"(%["#P1"])        \n\t"       \
-    "ulw    %[temp5], "#B"(%["#P1"])        \n\t"       \
-    "ulw    %[temp6], "#C"(%["#P1"])        \n\t"       \
-    "ulw    %[temp7], "#D"(%["#P1"])        \n\t"       \
+    "ulw    %[temp0], " #A "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp1], " #B "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp2], " #C "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp3], " #D "(%[" #P0 "])    \n\t"       \
+    "ulw    %[temp4], " #A "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp5], " #B "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp6], " #C "(%[" #P1 "])    \n\t"       \
+    "ulw    %[temp7], " #D "(%[" #P1 "])    \n\t"       \
    "addu   %[temp4], %[temp4],   %[temp0]  \n\t"       \
    "addu   %[temp5], %[temp5],   %[temp1]  \n\t"       \
    "addu   %[temp6], %[temp6],   %[temp2]  \n\t"       \
    "addu   %[temp7], %[temp7],   %[temp3]  \n\t"       \
-    "addiu  %["#P0"],  %["#P0"],  16        \n\t"       \
-  ".if "#E" == 1                            \n\t"       \
-    "addiu  %["#P1"],  %["#P1"],  16        \n\t"       \
+    "addiu  %[" #P0 "],  %[" #P0 "],  16    \n\t"       \
+  ".if " #E " == 1                          \n\t"       \
+    "addiu  %[" #P1 "],  %[" #P1 "],  16    \n\t"       \
  ".endif                                   \n\t"       \
-    "usw    %[temp4], "#A"(%["#P2"])        \n\t"       \
-    "usw    %[temp5], "#B"(%["#P2"])        \n\t"       \
-    "usw    %[temp6], "#C"(%["#P2"])        \n\t"       \
-    "usw    %[temp7], "#D"(%["#P2"])        \n\t"       \
-    "addiu  %["#P2"], %["#P2"],   16        \n\t"       \
-    "bne    %["#P0"], %[LoopEnd], 1b        \n\t"       \
+    "usw    %[temp4], " #A "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp5], " #B "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp6], " #C "(%[" #P2 "])    \n\t"       \
+    "usw    %[temp7], " #D "(%[" #P2 "])    \n\t"       \
+    "addiu  %[" #P2 "], %[" #P2 "],   16    \n\t"       \
+    "bne    %[" #P0 "], %[LoopEnd], 1b      \n\t"       \
    ".set   pop                             \n\t"       \

 #define ASM_END_COMMON_0                                \
--- a/src/dsp/lossless_neon.c
+++ b/src/dsp/lossless_neon.c
@ -259,20 +259,45 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 //------------------------------------------------------------------------------
 // Subtract-Green Transform

-// vtbl? are unavailable in iOS/arm64 builds.
-#if !defined(__aarch64__)
+// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
+// non-standard versions there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
+#define USE_VTBLQ
+#endif

-// 255 = byte will be zero'd
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+  1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x16_t shuffle) {
+  return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+                     vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else  // !USE_VTBLQ
+// 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };

+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+                                             const uint8x8_t shuffle) {
+  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+                     vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif  // USE_VTBLQ
+
 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens =
-        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
-                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
@ -281,19 +306,21 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {

 static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens =
-        vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
-                    vtbl1_u8(vget_high_u8(argb), shuffle));
+    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
 }

-#endif   // !__aarch64__
+#undef USE_VTBLQ

 #endif   // USE_INTRINSICS

@ -320,11 +347,9 @@ void VP8LDspInitNEON(void) {
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

-#if !defined(__aarch64__)
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
 #endif
-#endif

 #endif   // WEBP_USE_NEON
 }
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@ -189,7 +189,12 @@ const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
 extern void WebPInitUpsamplersSSE2(void);
 extern void WebPInitUpsamplersNEON(void);

+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
+    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
+
 void WebPInitUpsamplers(void) {
+  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
+
 #ifdef FANCY_UPSAMPLING
  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
@ -217,6 +222,7 @@ void WebPInitUpsamplers(void) {
 #endif
  }
 #endif  // FANCY_UPSAMPLING
+  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }

 //------------------------------------------------------------------------------
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@ -123,7 +123,12 @@ WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 extern void WebPInitSamplersSSE2(void);
 extern void WebPInitSamplersMIPS32(void);

+static volatile VP8CPUInfo yuv_last_cpuinfo_used =
+    (VP8CPUInfo)&yuv_last_cpuinfo_used;
+
 void WebPInitSamplers(void) {
+  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
  WebPSamplers[MODE_RGB]       = YuvToRgbRow;
  WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
  WebPSamplers[MODE_BGR]       = YuvToBgrRow;
@ -149,6 +154,7 @@ void WebPInitSamplers(void) {
    }
 #endif  // WEBP_USE_MIPS32
  }
+  yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }

 //-----------------------------------------------------------------------------
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@ -47,12 +47,11 @@

 static int EncodeLossless(const uint8_t* const data, int width, int height,
                          int effort_level,  // in [0..6] range
-                          VP8BitWriter* const bw,
+                          VP8LBitWriter* const bw,
                          WebPAuxStats* const stats) {
  int ok = 0;
  WebPConfig config;
  WebPPicture picture;
-  VP8LBitWriter tmp_bw;

  WebPPictureInit(&picture);
  picture.width = width;
@ -84,16 +83,15 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
  config.quality = 8.f * effort_level;
  assert(config.quality >= 0 && config.quality <= 100.f);

-  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
-  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK);
  WebPPictureFree(&picture);
-  if (ok) {
-    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
-    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
-    VP8BitWriterAppend(bw, buffer, buffer_size);
+  ok = ok && !bw->error_;
+  if (!ok) {
+    VP8LBitWriterDestroy(bw);
+    return 0;
  }
-  VP8LBitWriterDestroy(&tmp_bw);
-  return ok && !bw->error_;
+  return 1;
+
 }

 // -----------------------------------------------------------------------------
@ -115,8 +113,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
  const uint8_t* alpha_src;
  WebPFilterFunc filter_func;
  uint8_t header;
-  size_t expected_size;
  const size_t data_size = width * height;
+  const uint8_t* output = NULL;
+  size_t output_size = 0;
+  VP8LBitWriter tmp_bw;

  assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
  assert(filter >= 0 && filter < WEBP_FILTER_LAST);
@ -125,15 +125,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
  assert(sizeof(header) == ALPHA_HEADER_LEN);
  // TODO(skal): have a common function and #define's to validate alpha params.

-  expected_size =
-      (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
-                                       : (data_size >> 5);
-  header = method | (filter << 2);
-  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
-
-  VP8BitWriterInit(&result->bw, expected_size);
-  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
-
  filter_func = WebPFilters[filter];
  if (filter_func != NULL) {
    filter_func(data, width, height, width, tmp_alpha);
@ -142,14 +133,42 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
    alpha_src = data;
  }

-  if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
-    ok = ok && !result->bw.error_;
-  } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level,
-                        &result->bw, &result->stats);
-    VP8BitWriterFinish(&result->bw);
+  if (method != ALPHA_NO_COMPRESSION) {
+    ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+    ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+                              &tmp_bw, &result->stats);
+    if (ok) {
+      output = VP8LBitWriterFinish(&tmp_bw);
+      output_size = VP8LBitWriterNumBytes(&tmp_bw);
+      if (output_size > data_size) {
+        // compressed size is larger than source! Revert to uncompressed mode.
+        method = ALPHA_NO_COMPRESSION;
+        VP8LBitWriterDestroy(&tmp_bw);
+      }
+    } else {
+      VP8LBitWriterDestroy(&tmp_bw);
+      return 0;
+    }
  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    output = alpha_src;
+    output_size = data_size;
+    ok = 1;
+  }
+
+  // Emit final result.
+  header = method | (filter << 2);
+  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+  VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+  ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+  if (method != ALPHA_NO_COMPRESSION) {
+    VP8LBitWriterDestroy(&tmp_bw);
+  }
+  ok = ok && !result->bw.error_;
  result->score = VP8BitWriterSize(&result->bw);
  return ok;
 }
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@ -141,7 +141,11 @@ static void MergeHistograms(const VP8Histogram* const in,

 static void AssignSegments(VP8Encoder* const enc,
                           const int alphas[MAX_ALPHA + 1]) {
-  const int nb = enc->segment_hdr_.num_segments_;
+  // 'num_segments_' was already validated to be <= NUM_MB_SEGMENTS, but an
+  // explicit clamp is needed to avoid a spurious warning about 'n + 1'
+  // exceeding the bounds of 'centers' with some compilers (seen with gcc-4.9).
+  const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+                 enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
  int centers[NUM_MB_SEGMENTS];
  int weighted_average = 0;
  int map[MAX_ALPHA + 1];
--- a/src/enc/config.c
+++ b/src/enc/config.c
@ -111,7 +111,11 @@ int WebPValidateConfig(const WebPConfig* config) {
    return 0;
  if (config->show_compressed < 0 || config->show_compressed > 1)
    return 0;
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+  if (config->preprocessing < 0 || config->preprocessing > 7)
+#else
  if (config->preprocessing < 0 || config->preprocessing > 3)
+#endif
    return 0;
  if (config->partitions < 0 || config->partitions > 3)
    return 0;
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@ -42,7 +42,7 @@ typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
                                         VP8Residual* const res);
 extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;

-extern void VP8SetResidualCoeffsInit(void);  // must be called first
+void VP8SetResidualCoeffsInit(void);  // must be called first

 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);

@ -59,7 +59,7 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
 typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
 extern VP8GetResidualCostFunc VP8GetResidualCost;

-extern void VP8GetResidualCostInit(void);  // must be called first
+void VP8GetResidualCostInit(void);  // must be called first

 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@ -508,7 +508,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
      }
      case 7: *info = mb->alpha_; break;
      default: *info = 0; break;
-    };
+    }
  }
 #if SEGMENT_VISU  // visualize segments and prediction modes
  SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@ -20,6 +20,9 @@
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"

+#define ALIGN_CST 15
+#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+
 #define MAX_COST 1.e38

 // Number of partitions for the three dominant (literal, red and blue) symbol
@ -101,9 +104,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
 VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
  int i;
  VP8LHistogramSet* set;
-  const size_t total_size = sizeof(*set)
-                            + sizeof(*set->histograms) * size
-                            + (size_t)VP8LGetHistogramSize(cache_bits) * size;
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  const size_t total_size =
+      sizeof(*set) + size * (sizeof(*set->histograms) + histo_size + ALIGN_CST);
  uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
  if (memory == NULL) return NULL;

@ -114,12 +117,12 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
  set->max_size = size;
  set->size = size;
  for (i = 0; i < size; ++i) {
+    memory = (uint8_t*)DO_ALIGN(memory);
    set->histograms[i] = (VP8LHistogram*)memory;
    // literal_ won't necessarily be aligned.
    set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
    VP8LHistogramInit(set->histograms[i], cache_bits);
-    // There's no padding/alignment between successive histograms.
-    memory += VP8LGetHistogramSize(cache_bits);
+    memory += histo_size;
  }
  return set;
 }
--- a/src/enc/picture_csp.c
+++ b/src/enc/picture_csp.c
--- a/src/enc/picture_rescale.c
+++ b/src/enc/picture_rescale.c
@ -175,17 +175,13 @@ static void RescalePlane(const uint8_t* src,
                         int src_width, int src_height, int src_stride,
                         uint8_t* dst,
                         int dst_width, int dst_height, int dst_stride,
-                         int32_t* const work,
+                         rescaler_t* const work,
                         int num_channels) {
  WebPRescaler rescaler;
  int y = 0;
  WebPRescalerInit(&rescaler, src_width, src_height,
                   dst, dst_width, dst_height, dst_stride,
-                   num_channels,
-                   src_width, dst_width,
-                   src_height, dst_height,
-                   work);
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+                   num_channels, work);
  while (y < src_height) {
    y += WebPRescalerImport(&rescaler, src_height - y,
                            src + y * src_stride, src_stride);
@ -209,7 +205,7 @@ static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
-  int32_t* work;
+  rescaler_t* work;

  if (pic == NULL) return 0;
  prev_width = pic->width;
@ -231,7 +227,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  if (!WebPPictureAlloc(&tmp)) return 0;

  if (!pic->use_argb) {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
@ -259,7 +255,7 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                 tmp.v,
                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
  } else {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    work = (rescaler_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
    if (work == NULL) {
      WebPPictureFree(&tmp);
      return 0;
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@ -30,7 +30,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 1
+#define ENC_REV_VERSION 4

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@ -457,10 +457,10 @@ struct VP8Encoder {
  VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
  uint8_t*   preds_;     // prediction modes: (4*mb_w+1) * (4*mb_h+1)
  uint32_t*  nz_;        // non-zero bit context: mb_w+1
-  uint8_t   *y_top_;     // top luma samples.
-  uint8_t   *uv_top_;    // top u/v samples.
+  uint8_t*   y_top_;     // top luma samples.
+  uint8_t*   uv_top_;    // top u/v samples.
                         // U and V are packed into 16 bytes (8 U + 8 V)
-  LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };

 //------------------------------------------------------------------------------
@ -571,7 +571,7 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);

 //------------------------------------------------------------------------------

-#if WEBP_ENCODER_ABI_VERSION <= 0x0202
+#if WEBP_ENCODER_ABI_VERSION <= 0x0203
 void WebPMemoryWriterClear(WebPMemoryWriter* writer);
 #endif

--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@ -1081,6 +1081,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
    int y;
    err = AllocateTransformBuffer(enc, width, height);
    if (err != VP8_ENC_OK) goto Error;
+    assert(enc->argb_ != NULL);
    for (y = 0; y < height; ++y) {
      memcpy(enc->argb_ + y * width,
             picture->argb + y * picture->argb_stride,
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@ -326,18 +326,26 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {

  if (!config->lossless) {
    VP8Encoder* enc = NULL;
-    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+    if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
      // Make sure we have YUVA samples.
-      float dithering = 0.f;
-      if (config->preprocessing & 2) {
-        const float x = config->quality / 100.f;
-        const float x2 = x * x;
-        // slowly decreasing from max dithering at low quality (q->0)
-        // to 0.5 dithering amplitude at high quality (q->100)
-        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
-      }
-      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
-        return 0;
+      if (config->preprocessing & 4) {
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+        if (!WebPPictureSmartARGBToYUVA(pic)) {
+          return 0;
+        }
+#endif
+      } else {
+        float dithering = 0.f;
+        if (config->preprocessing & 2) {
+          const float x = config->quality / 100.f;
+          const float x2 = x * x;
+          // slowly decreasing from max dithering at low quality (q->0)
+          // to 0.5 dithering amplitude at high quality (q->100)
+          dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+        }
+        if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+          return 0;
+        }
      }
    }

--- a/src/mux/Makefile.am
+++ b/src/mux/Makefile.am
@ -12,6 +12,6 @@ libwebpmuxinclude_HEADERS += ../webp/mux_types.h
 libwebpmuxinclude_HEADERS += ../webp/types.h

 libwebpmux_la_LIBADD = ../libwebp.la
-libwebpmux_la_LDFLAGS = -no-undefined -version-info 1:1:0
+libwebpmux_la_LDFLAGS = -no-undefined -version-info 1:2:0
 libwebpmuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpmux.pc
--- a/src/mux/muxi.h
+++ b/src/mux/muxi.h
@ -28,7 +28,7 @@ extern "C" {

 #define MUX_MAJ_VERSION 0
 #define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 1
+#define MUX_REV_VERSION 2

 // Chunk object.
 typedef struct WebPChunk WebPChunk;
--- a/src/utils/bit_reader.c
+++ b/src/utils/bit_reader.c
@ -105,9 +105,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
 //------------------------------------------------------------------------------
 // VP8LBitReader

-#define LBITS 64      // Number of bits prefetched.
-#define WBITS 32      // Minimum number of bytes needed after VP8LFillBitWindow.
-#define LOG8_WBITS 4  // Number of bytes needed to store WBITS bits.
+#define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.

 #if !defined(WEBP_FORCE_ALIGNED) && \
    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
@ -151,16 +149,6 @@ void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
  br->buf_ = start;
 }

-// Special version that assumes br->pos_ <= br_len_.
-static int IsEndOfStreamSpecial(const VP8LBitReader* const br) {
-  assert(br->pos_ <= br->len_);
-  return br->pos_ == br->len_ && br->bit_pos_ >= LBITS;
-}
-
-static int IsEndOfStream(const VP8LBitReader* const br) {
-  return (br->pos_ > br->len_) || IsEndOfStreamSpecial(br);
-}
-
 void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
                            const uint8_t* const buf, size_t len) {
  assert(br != NULL);
@ -168,38 +156,39 @@ void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
  br->buf_ = buf;
  br->len_ = len;
-  br->eos_ = IsEndOfStream(br);
+  // pos_ > len_ should be considered a param error.
+  br->error_ = (br->pos_ > br->len_);
+  br->eos_ = br->error_ || VP8LIsEndOfStream(br);
 }

-// If not at EOS, reload up to LBITS byte-by-byte
+// If not at EOS, reload up to VP8L_LBITS byte-by-byte
 static void ShiftBytes(VP8LBitReader* const br) {
  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
    br->val_ >>= 8;
-    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (LBITS - 8);
+    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (VP8L_LBITS - 8);
    ++br->pos_;
    br->bit_pos_ -= 8;
  }
+  br->eos_ = VP8LIsEndOfStream(br);
 }

-void VP8LFillBitWindow(VP8LBitReader* const br) {
-  if (br->bit_pos_ >= WBITS) {
-    // TODO(jzern): given the fixed read size it may be possible to force
-    //              alignment in this block.
+void VP8LDoFillBitWindow(VP8LBitReader* const br) {
+  assert(br->bit_pos_ >= VP8L_WBITS);
+  // TODO(jzern): given the fixed read size it may be possible to force
+  //              alignment in this block.
 #if defined(VP8L_USE_UNALIGNED_LOAD)
-    if (br->pos_ + sizeof(br->val_) < br->len_) {
-      br->val_ >>= WBITS;
-      br->bit_pos_ -= WBITS;
-      // The expression below needs a little-endian arch to work correctly.
-      // This gives a large speedup for decoding speed.
-      br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
-                  (LBITS - WBITS);
-      br->pos_ += LOG8_WBITS;
-      return;
-    }
-#endif
-    ShiftBytes(br);       // Slow path.
-    br->eos_ = IsEndOfStreamSpecial(br);
+  if (br->pos_ + sizeof(br->val_) < br->len_) {
+    br->val_ >>= VP8L_WBITS;
+    br->bit_pos_ -= VP8L_WBITS;
+    // The expression below needs a little-endian arch to work correctly.
+    // This gives a large speedup for decoding speed.
+    br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+                (VP8L_LBITS - VP8L_WBITS);
+    br->pos_ += VP8L_LOG8_WBITS;
+    return;
  }
+#endif
+  ShiftBytes(br);       // Slow path.
 }

 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
@ -210,8 +199,6 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
        (uint32_t)(br->val_ >> br->bit_pos_) & kBitMask[n_bits];
    const int new_bits = br->bit_pos_ + n_bits;
    br->bit_pos_ = new_bits;
-    // If this read is going to cross the read buffer, set the eos flag.
-    br->eos_ = IsEndOfStreamSpecial(br);
    ShiftBytes(br);
    return val;
  } else {
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@ -107,6 +107,9 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
 // maximum number of bits (inclusive) the bit-reader can handle:
 #define VP8L_MAX_NUM_BIT_READ 24

+#define VP8L_LBITS 64  // Number of bits prefetched.
+#define VP8L_WBITS 32  // Minimum number of bits ready after VP8LFillBitWindow.
+
 typedef uint64_t vp8l_val_t;  // right now, this bit-reader can only use 64bit.

 typedef struct {
@ -138,14 +141,26 @@ static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
  return (uint32_t)(br->val_ >> br->bit_pos_);
 }

+// Returns true if there was an attempt at reading bits past the end of
+// the buffer. Doesn't set br->eos_ flag.
+static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
+  assert(br->pos_ <= br->len_);
+  return (br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS);
+}
+
 // For jumping over a number of bits in the bit stream when accessed with
 // VP8LPrefetchBits and VP8LFillBitWindow.
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
  br->bit_pos_ = val;
+  br->eos_ = VP8LIsEndOfStream(br);
 }

 // Advances the read buffer by 4 bytes to make room for reading next 32 bits.
-void VP8LFillBitWindow(VP8LBitReader* const br);
+// Speed critical, but infrequent part of the code can be non-inlined.
+extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
+static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
+  if (br->bit_pos_ >= VP8L_WBITS) VP8LDoFillBitWindow(br);
+}

 #ifdef __cplusplus
 }    // extern "C"
--- a/src/utils/bit_reader_inl.h
+++ b/src/utils/bit_reader_inl.h
@ -24,6 +24,7 @@
 #include <string.h>  // memcpy
 #endif

+#include "../dsp/dsp.h"
 #include "./bit_reader.h"
 #include "./endian_inl.h"

@ -63,7 +64,7 @@ static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
 #if defined(WEBP_FORCE_ALIGNED)
    lbit_t in_bits;
    memcpy(&in_bits, br->buf_, sizeof(in_bits));
-#elif defined(__mips__)                        // MIPS
+#elif defined(WEBP_USE_MIPS32)
    // This is needed because of un-aligned read.
    lbit_t in_bits;
    lbit_t* p_buf_ = (lbit_t*)br->buf_;
--- a/src/utils/bit_writer.c
+++ b/src/utils/bit_writer.c
@ -52,7 +52,7 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
  return 1;
 }

-static void kFlush(VP8BitWriter* const bw) {
+static void Flush(VP8BitWriter* const bw) {
  const int s = 8 + bw->nb_bits_;
  const int32_t bits = bw->value_ >> s;
  assert(bw->nb_bits_ >= 0);
@ -118,7 +118,7 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob) {
    bw->range_ = kNewRange[bw->range_];
    bw->value_ <<= shift;
    bw->nb_bits_ += shift;
-    if (bw->nb_bits_ > 0) kFlush(bw);
+    if (bw->nb_bits_ > 0) Flush(bw);
  }
  return bit;
 }
@ -135,7 +135,7 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
    bw->range_ = kNewRange[bw->range_];
    bw->value_ <<= 1;
    bw->nb_bits_ += 1;
-    if (bw->nb_bits_ > 0) kFlush(bw);
+    if (bw->nb_bits_ > 0) Flush(bw);
  }
  return bit;
 }
@ -173,14 +173,14 @@ int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
 uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
  VP8PutValue(bw, 0, 9 - bw->nb_bits_);
  bw->nb_bits_ = 0;   // pad with zeroes
-  kFlush(bw);
+  Flush(bw);
  return bw->buf_;
 }

 int VP8BitWriterAppend(VP8BitWriter* const bw,
                       const uint8_t* data, size_t size) {
  assert(data != NULL);
-  if (bw->nb_bits_ != -8) return 0;   // kFlush() must have been called
+  if (bw->nb_bits_ != -8) return 0;   // Flush() must have been called
  if (!BitWriterResize(bw, size)) return 0;
  memcpy(bw->buf_ + bw->pos_, data, size);
  bw->pos_ += size;
--- a/src/utils/endian_inl.h
+++ b/src/utils/endian_inl.h
@ -16,6 +16,7 @@
 #include "../webp/config.h"
 #endif

+#include "../dsp/dsp.h"
 #include "../webp/types.h"

 // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
@ -34,27 +35,15 @@
 #endif

 #if !defined(HAVE_CONFIG_H)
-#ifdef __GNUC__
-# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
-#else
-# define LOCAL_GCC_VERSION 0
-#endif  // __GNUC__
-
-#ifdef __clang__
-# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
-#else
-# define LOCAL_CLANG_VERSION 0
-#endif  // __clang__
-
-// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
-#if LOCAL_GCC_VERSION >= 0x403 || LOCAL_CLANG_VERSION >= 0x303
-#define HAVE_BUILTIN_BSWAP32
-#define HAVE_BUILTIN_BSWAP64
-#endif
-// clang-3.3 and gcc-4.8 have a builtin function for swap16
-#if LOCAL_GCC_VERSION >= 0x408 || LOCAL_CLANG_VERSION >= 0x303
+#if LOCAL_GCC_PREREQ(4,8) || __has_builtin(__builtin_bswap16)
 #define HAVE_BUILTIN_BSWAP16
 #endif
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+#if LOCAL_GCC_PREREQ(4,3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
 #endif  // !HAVE_CONFIG_H

 static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
@ -69,7 +58,16 @@ static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
 }

 static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
-#if defined(HAVE_BUILTIN_BSWAP32)
+#if defined(WEBP_USE_MIPS32_R2)
+  uint32_t ret;
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
  return __builtin_bswap32(x);
 #elif defined(__i386__) || defined(__x86_64__)
  uint32_t swapped_bytes;
--- a/src/utils/quant_levels_dec.c
+++ b/src/utils/quant_levels_dec.c
@ -32,10 +32,10 @@
 #define DSIZE 4          // dithering size (must be a power of two)
 // cf. http://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
- {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
- { 12,  4, 14,  6 },
- {  3, 11,  1,  9 },
- { 15,  7, 13,  5 }
+  {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
+  { 12,  4, 14,  6 },
+  {  3, 11,  1,  9 },
+  { 15,  7, 13,  5 }
 };

 #else
--- a/src/utils/rescaler.c
+++ b/src/utils/rescaler.c
@ -13,77 +13,192 @@

 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./rescaler.h"
 #include "../dsp/dsp.h"

 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow

-void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                              const uint8_t* const src, int channel) = NULL;
-void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out) = NULL;
+// Import a row of data and save its contribution in the rescaler.
+// All channels of the row are imported in one call. 'Expand' corresponds to
+// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerImportRowFunc)(WebPRescaler* const wrk,
+                                          const uint8_t* src);
+static WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+static WebPRescalerImportRowFunc WebPRescalerImportRowShrink;

-#define RFIX 30
-#define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+// Export one full row (all channels, starting at x_out = 0) from rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerExportRowFunc)(WebPRescaler* const wrk);
+static WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+static WebPRescalerExportRowFunc WebPRescalerExportRowShrink;

-static void ImportRowC(WebPRescaler* const wrk,
-                       const uint8_t* const src, int channel) {
+#define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
+#define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
+#define WEBP_RESCALER_FRAC(x, y) \
+    ((uint32_t)(((uint64_t)(x) << WEBP_RESCALER_RFIX) / (y)))
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+static void ImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
-  int x_in = channel;
-  int x_out;
-  int accum = 0;
-  if (!wrk->x_expand) {
-    int sum = 0;
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-      accum += wrk->x_add;
-      for (; accum > 0; accum -= wrk->x_sub) {
-        sum += src[x_in];
-        x_in += x_stride;
-      }
-      {        // Emit next horizontal pixel.
-        const int32_t base = src[x_in];
-        const int32_t frac = base * (-accum);
-        x_in += x_stride;
-        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
-        // fresh fractional start for next pixel
-        sum = (int)MULT_FIX(frac, wrk->fx_scale);
-      }
-    }
-  } else {        // simple bilinear interpolation
-    int left = src[channel], right = src[channel];
-    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    // simple bilinear interpolation
+    int accum = wrk->x_add;
+    int left = src[x_in];
+    int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+    x_in += x_stride;
+    while (1) {
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      x_out += x_stride;
+      if (x_out >= x_out_max) break;
+      accum -= wrk->x_sub;
      if (accum < 0) {
        left = right;
        x_in += x_stride;
+        assert(x_in < wrk->src_width * x_stride);
        right = src[x_in];
        accum += wrk->x_add;
      }
-      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
-      accum -= wrk->x_sub;
    }
-  }
-  // Accumulate the contribution of the new row.
-  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
-    wrk->irow[x_out] += wrk->frow[x_out];
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
  }
 }

-static void ExportRowC(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    for (; x_out < x_out_max; ++x_out) {
-      const int frac = (int)MULT_FIX(frow[x_out], yscale);
+static void ImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  int channel;
+  assert(!WebPRescalerInputDone(wrk));
+  assert(!wrk->x_expand);
+  for (channel = 0; channel < x_stride; ++channel) {
+    int x_in = channel;
+    int x_out = channel;
+    uint32_t sum = 0;
+    int accum = 0;
+    while (x_out < x_out_max) {
+      uint32_t base = 0;
+      accum += wrk->x_add;
+      while (accum > 0) {
+        accum -= wrk->x_sub;
+        assert(x_in < wrk->src_width * x_stride);
+        base = src[x_in];
+        sum += base;
+        x_in += x_stride;
+      }
+      {        // Emit next horizontal pixel.
+        const rescaler_t frac = base * (-accum);
+        wrk->frow[x_out] = sum * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = (int)MULT_FIX(frac, wrk->fx_scale);
+      }
+      x_out += x_stride;
+    }
+    assert(accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpandC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t J = frow[x_out];
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint64_t I = (uint64_t)A * frow[x_out]
+                       + (uint64_t)B * irow[x_out];
+      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+      const int v = (int)MULT_FIX(J, wrk->fy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+    }
+  }
+}
+
+static void ExportRowShrinkC(WebPRescaler* const wrk) {
+  int x_out;
+  uint8_t* const dst = wrk->dst;
+  rescaler_t* const irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* const frow = wrk->frow;
+  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  if (yscale) {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
-      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
      irow[x_out] = frac;   // new fractional start
    }
+  } else {
+    for (x_out = 0; x_out < x_out_max; ++x_out) {
+      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+      assert(v >= 0 && v <= 255);
+      dst[x_out] = v;
+      irow[x_out] = 0;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Main entry calls
+
+void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+  assert(!WebPRescalerInputDone(wrk));
+  if (!wrk->x_expand) {
+    WebPRescalerImportRowShrink(wrk, src);
+  } else {
+    WebPRescalerImportRowExpand(wrk, src);
+  }
+}
+
+void WebPRescalerExportRow(WebPRescaler* const wrk) {
+  if (wrk->y_accum <= 0) {
+    assert(!WebPRescalerOutputDone(wrk));
+    if (wrk->y_expand) {
+      WebPRescalerExportRowExpand(wrk);
+    } else if (wrk->fxy_scale) {
+      WebPRescalerExportRowShrink(wrk);
+    } else {  // very special case for src = dst = 1x1
+      int i;
+      assert(wrk->src_width == 1 && wrk->dst_width <= 2);
+      assert(wrk->src_height == 1 && wrk->dst_height == 1);
+      for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
+        wrk->dst[i] = wrk->irow[i];
+        wrk->irow[i] = 0;
+      }
+    }
    wrk->y_accum += wrk->y_add;
    wrk->dst += wrk->dst_stride;
+    ++wrk->dst_y;
  }
 }

@ -92,23 +207,25 @@ static void ExportRowC(WebPRescaler* const wrk, int x_out) {

 #if defined(WEBP_USE_MIPS32)

-static void ImportRowMIPS(WebPRescaler* const wrk,
-                          const uint8_t* const src, int channel) {
+static void ImportRowShrinkMIPS(WebPRescaler* const wrk, const uint8_t* src) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  const int fx_scale = wrk->fx_scale;
  const int x_add = wrk->x_add;
  const int x_sub = wrk->x_sub;
-  int* frow = wrk->frow + channel;
-  int* irow = wrk->irow + channel;
-  const uint8_t* src1 = src + channel;
-  int temp1, temp2, temp3;
-  int base, frac, sum;
-  int accum, accum1;
  const int x_stride1 = x_stride << 2;
-  int loop_c = x_out_max - channel;
+  int channel;
+  assert(!wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3;
+    int base, frac, sum;
+    int accum, accum1;
+    int loop_c = x_out_max - channel;

-  if (!wrk->x_expand) {
    __asm__ volatile (
      "li     %[temp1],   0x8000                    \n\t"
      "li     %[temp2],   0x10000                   \n\t"
@ -116,179 +233,295 @@ static void ImportRowMIPS(WebPRescaler* const wrk,
      "li     %[accum],   0                         \n\t"
    "1:                                             \n\t"
      "addu   %[accum],   %[accum],   %[x_add]      \n\t"
+      "li     %[base],    0                         \n\t"
      "blez   %[accum],   3f                        \n\t"
    "2:                                             \n\t"
-      "lbu    %[temp3],   0(%[src1])                \n\t"
+      "lbu    %[base],    0(%[src1])                \n\t"
      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
-      "addu   %[sum],     %[sum],     %[temp3]      \n\t"
+      "addu   %[sum],     %[sum],     %[base]       \n\t"
      "bgtz   %[accum],   2b                        \n\t"
    "3:                                             \n\t"
-      "lbu    %[base],    0(%[src1])                \n\t"
-      "addu   %[src1],    %[src1],    %[x_stride]   \n\t"
      "negu   %[accum1],  %[accum]                  \n\t"
      "mul    %[frac],    %[base],    %[accum1]     \n\t"
-      "addu   %[temp3],   %[sum],     %[base]       \n\t"
-      "mul    %[temp3],   %[temp3],   %[x_sub]      \n\t"
-      "lw     %[base],    0(%[irow])                \n\t"
+      "mul    %[temp3],   %[sum],     %[x_sub]      \n\t"
      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "sll    %[accum1],  %[frac],    2             \n\t"
      "mult   %[temp1],   %[temp2]                  \n\t"
-      "madd   %[accum1],  %[fx_scale]               \n\t"
+      "maddu  %[frac],    %[fx_scale]               \n\t"
      "mfhi   %[sum]                                \n\t"
      "subu   %[temp3],   %[temp3],   %[frac]       \n\t"
      "sw     %[temp3],   0(%[frow])                \n\t"
-      "add    %[base],    %[base],    %[temp3]      \n\t"
-      "sw     %[base],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[irow],    %[x_stride1]  \n\t"
      "addu   %[frow],    %[frow],    %[x_stride1]  \n\t"
      "bgtz   %[loop_c],  1b                        \n\t"
+      : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+        [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+        [frow]"+r"(frow), [accum1]"=&r"(accum1),
+        [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+      : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+        [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+        [loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
+      : "memory", "hi", "lo"
+    );
+    assert(accum == 0);
+  }
+}

-      : [accum] "=&r" (accum), [src1] "+r" (src1), [temp3] "=&r" (temp3),
-        [sum] "=&r" (sum), [base] "=&r" (base), [frac] "=&r" (frac),
-        [frow] "+r" (frow), [irow] "+r" (irow), [accum1] "=&r" (accum1),
-        [temp2] "=&r" (temp2), [temp1] "=&r" (temp1)
-      : [x_stride] "r" (x_stride), [fx_scale] "r" (fx_scale),
-        [x_sub] "r" (x_sub), [x_add] "r" (x_add),
-        [loop_c] "r" (loop_c), [x_stride1] "r" (x_stride1)
+static void ImportRowExpandMIPS(WebPRescaler* const wrk, const uint8_t* src) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const int x_add = wrk->x_add;
+  const int x_sub = wrk->x_sub;
+  const int src_width = wrk->src_width;
+  const int x_stride1 = x_stride << 2;
+  int channel;
+  assert(wrk->x_expand);
+  assert(!WebPRescalerInputDone(wrk));
+
+  for (channel = 0; channel < x_stride; ++channel) {
+    const uint8_t* src1 = src + channel;
+    rescaler_t* frow = wrk->frow + channel;
+    int temp1, temp2, temp3, temp4;
+    int frac;
+    int accum;
+    int x_out = channel;
+
+    __asm__ volatile (
+      "addiu  %[temp3],   %[src_width], -1            \n\t"
+      "lbu    %[temp2],   0(%[src1])                  \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "bgtz   %[temp3],   0f                          \n\t"
+      "addiu  %[temp1],   %[temp2],     0             \n\t"
+      "b      3f                                      \n\t"
+    "0:                                               \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+    "3:                                               \n\t"
+      "addiu  %[accum],   %[x_add],     0             \n\t"
+    "1:                                               \n\t"
+      "subu   %[temp3],   %[temp2],     %[temp1]      \n\t"
+      "mul    %[temp3],   %[temp3],     %[accum]      \n\t"
+      "mul    %[temp4],   %[temp1],     %[x_add]      \n\t"
+      "addu   %[temp3],   %[temp4],     %[temp3]      \n\t"
+      "sw     %[temp3],   0(%[frow])                  \n\t"
+      "addu   %[frow],    %[frow],      %[x_stride1]  \n\t"
+      "addu   %[x_out],   %[x_out],     %[x_stride]   \n\t"
+      "subu   %[temp3],   %[x_out],     %[x_out_max]  \n\t"
+      "bgez   %[temp3],   2f                          \n\t"
+      "subu   %[accum],   %[accum],     %[x_sub]      \n\t"
+      "bgez   %[accum],   4f                          \n\t"
+      "addiu  %[temp2],   %[temp1],     0             \n\t"
+      "addu   %[src1],    %[src1],      %[x_stride]   \n\t"
+      "lbu    %[temp1],   0(%[src1])                  \n\t"
+      "addu   %[accum],   %[accum],     %[x_add]      \n\t"
+    "4:                                               \n\t"
+      "b      1b                                      \n\t"
+    "2:                                               \n\t"
+      : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+        [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+        [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+      : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+        [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+        [x_out_max]"r"(x_out_max)
+      : "memory", "hi", "lo"
+    );
+    assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpandMIPS(WebPRescaler* const wrk) {
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  const rescaler_t* frow = wrk->frow;
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fy_scale;
+  const int temp6 = x_out_max << 2;
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(wrk->y_expand);
+  assert(wrk->y_sub != 0);
+  if (wrk->y_accum == 0) {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
+      : "memory", "hi", "lo"
+    );
+  } else {
+    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "lw       %[temp1],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[A],        %[temp0]                   \n\t"
+      "maddu    %[B],        %[temp1]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp5],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+      : "memory", "hi", "lo"
+    );
+  }
+}
+
+static void ExportRowShrinkMIPS(WebPRescaler* const wrk) {
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  uint8_t* dst = wrk->dst;
+  rescaler_t* irow = wrk->irow;
+  const rescaler_t* frow = wrk->frow;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  int temp0, temp1, temp3, temp4, temp5, loop_end;
+  const int temp2 = (int)wrk->fxy_scale;
+  const int temp6 = x_out_max << 2;
+
+  assert(!WebPRescalerOutputDone(wrk));
+  assert(wrk->y_accum <= 0);
+  assert(!wrk->y_expand);
+  assert(wrk->fxy_scale != 0);
+  if (yscale) {
+    __asm__ volatile (
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[frow])                 \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "addiu    %[frow],     %[frow],     4             \n\t"
+      "maddu    %[temp0],    %[yscale]                  \n\t"
+      "mfhi     %[temp1]                                \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       %[temp1],    -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[frow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+        [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
      : "memory", "hi", "lo"
    );
  } else {
    __asm__ volatile (
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "li     %[accum],   0                         \n\t"
-    "1:                                             \n\t"
-      "bgez   %[accum],   2f                        \n\t"
-      "move   %[temp2],   %[temp1]                  \n\t"
-      "addu   %[src1],    %[x_stride]               \n\t"
-      "lbu    %[temp1],   0(%[src1])                \n\t"
-      "addu   %[accum],   %[x_add]                  \n\t"
-    "2:                                             \n\t"
-      "subu   %[temp3],   %[temp2],   %[temp1]      \n\t"
-      "mul    %[temp3],   %[temp3],   %[accum]      \n\t"
-      "mul    %[base],    %[temp1],   %[x_add]      \n\t"
-      "subu   %[accum],   %[accum],   %[x_sub]      \n\t"
-      "lw     %[frac],    0(%[irow])                \n\t"
-      "subu   %[loop_c],  %[loop_c],  %[x_stride]   \n\t"
-      "addu   %[temp3],   %[base],    %[temp3]      \n\t"
-      "sw     %[temp3],   0(%[frow])                \n\t"
-      "addu   %[frow],    %[x_stride1]              \n\t"
-      "addu   %[frac],    %[temp3]                  \n\t"
-      "sw     %[frac],    0(%[irow])                \n\t"
-      "addu   %[irow],    %[x_stride1]              \n\t"
-      "bgtz   %[loop_c],  1b                        \n\t"
-
-      : [src1] "+r" (src1), [accum] "=&r" (accum), [temp1] "=&r" (temp1),
-        [temp2] "=&r" (temp2), [temp3] "=&r" (temp3), [base] "=&r" (base),
-        [frac] "=&r" (frac), [frow] "+r" (frow), [irow] "+r" (irow)
-      : [x_stride] "r" (x_stride), [x_add] "r" (x_add), [x_sub] "r" (x_sub),
-        [x_stride1] "r" (x_stride1), [loop_c] "r" (loop_c)
+      "li       %[temp3],    0x10000                    \n\t"
+      "li       %[temp4],    0x8000                     \n\t"
+      "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
+    "1:                                                 \n\t"
+      "lw       %[temp0],    0(%[irow])                 \n\t"
+      "addiu    %[dst],      %[dst],      1             \n\t"
+      "addiu    %[irow],     %[irow],     4             \n\t"
+      "mult     %[temp3],    %[temp4]                   \n\t"
+      "maddu    %[temp0],    %[temp2]                   \n\t"
+      "mfhi     %[temp5]                                \n\t"
+      "sw       $zero,       -4(%[irow])                \n\t"
+      "sb       %[temp5],    -1(%[dst])                 \n\t"
+      "bne      %[irow],     %[loop_end], 1b            \n\t"
+      : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+        [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+        [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+      : [temp2]"r"(temp2), [temp6]"r"(temp6)
      : "memory", "hi", "lo"
    );
  }
 }

-static void ExportRowMIPS(WebPRescaler* const wrk, int x_out) {
-  if (wrk->y_accum <= 0) {
-    uint8_t* const dst = wrk->dst;
-    int32_t* const irow = wrk->irow;
-    const int32_t* const frow = wrk->frow;
-    const int yscale = wrk->fy_scale * (-wrk->y_accum);
-    const int x_out_max = wrk->dst_width * wrk->num_channels;
-    // if wrk->fxy_scale can fit into 32 bits use optimized code,
-    // otherwise use C code
-    if ((wrk->fxy_scale >> 32) == 0) {
-      int temp0, temp1, temp3, temp4, temp5, temp6, temp7, loop_end;
-      const int temp2 = (int)(wrk->fxy_scale);
-      const int temp8 = x_out_max << 2;
-      uint8_t* dst_t = (uint8_t*)dst;
-      int32_t* irow_t = (int32_t*)irow;
-      const int32_t* frow_t = (const int32_t*)frow;
-
-      __asm__ volatile(
-        "addiu    %[temp6],    $zero,       -256          \n\t"
-        "addiu    %[temp7],    $zero,       255           \n\t"
-        "li       %[temp3],    0x10000                    \n\t"
-        "li       %[temp4],    0x8000                     \n\t"
-        "addu     %[loop_end], %[frow_t],   %[temp8]      \n\t"
-      "1:                                                 \n\t"
-        "lw       %[temp0],    0(%[frow_t])               \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "addiu    %[frow_t],   %[frow_t],   4             \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[yscale]                  \n\t"
-        "mfhi     %[temp1]                                \n\t"
-        "lw       %[temp0],    0(%[irow_t])               \n\t"
-        "addiu    %[dst_t],    %[dst_t],    1             \n\t"
-        "addiu    %[irow_t],   %[irow_t],   4             \n\t"
-        "subu     %[temp0],    %[temp0],    %[temp1]      \n\t"
-        "mult     %[temp3],    %[temp4]                   \n\t"
-        "sll      %[temp0],    %[temp0],    2             \n\t"
-        "madd     %[temp0],    %[temp2]                   \n\t"
-        "mfhi     %[temp5]                                \n\t"
-        "sw       %[temp1],    -4(%[irow_t])              \n\t"
-        "and      %[temp0],    %[temp5],    %[temp6]      \n\t"
-        "slti     %[temp1],    %[temp5],    0             \n\t"
-        "beqz     %[temp0],    2f                         \n\t"
-        "xor      %[temp5],    %[temp5],    %[temp5]      \n\t"
-        "movz     %[temp5],    %[temp7],    %[temp1]      \n\t"
-      "2:                                                 \n\t"
-        "sb       %[temp5],    -1(%[dst_t])               \n\t"
-        "bne      %[frow_t],   %[loop_end], 1b            \n\t"
-
-        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
-          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
-          [temp7]"=&r"(temp7), [frow_t]"+r"(frow_t), [irow_t]"+r"(irow_t),
-          [dst_t]"+r"(dst_t), [loop_end]"=&r"(loop_end)
-        : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp8]"r"(temp8)
-        : "memory", "hi", "lo"
-      );
-      wrk->y_accum += wrk->y_add;
-      wrk->dst += wrk->dst_stride;
-    } else {
-      ExportRowC(wrk, x_out);
-    }
-  }
-}
 #endif   // WEBP_USE_MIPS32

 //------------------------------------------------------------------------------

 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
-                      uint8_t* const dst, int dst_width, int dst_height,
-                      int dst_stride, int num_channels, int x_add, int x_sub,
-                      int y_add, int y_sub, int32_t* const work) {
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels, rescaler_t* const work) {
+  const int x_add = src_width, x_sub = dst_width;
+  const int y_add = src_height, y_sub = dst_height;
  wrk->x_expand = (src_width < dst_width);
+  wrk->y_expand = (src_height < dst_height);
  wrk->src_width = src_width;
  wrk->src_height = src_height;
  wrk->dst_width = dst_width;
  wrk->dst_height = dst_height;
+  wrk->src_y = 0;
+  wrk->dst_y = 0;
  wrk->dst = dst;
  wrk->dst_stride = dst_stride;
  wrk->num_channels = num_channels;
+
  // for 'x_expand', we use bilinear interpolation
-  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add;
  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
-  wrk->y_accum = y_add;
-  wrk->y_add = y_add;
-  wrk->y_sub = y_sub;
-  wrk->fx_scale = (1 << RFIX) / x_sub;
-  wrk->fy_scale = (1 << RFIX) / y_sub;
-  wrk->fxy_scale = wrk->x_expand ?
-      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
-      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  if (!wrk->x_expand) {  // fx_scale is not used otherwise
+    wrk->fx_scale = WEBP_RESCALER_FRAC(1, wrk->x_sub);
+  }
+  // vertical scaling parameters
+  wrk->y_add = wrk->y_expand ? y_add - 1 : y_add;
+  wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
+  wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
+  if (!wrk->y_expand) {
+    // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // Numerator and denominator are both formed in 64 bits: the plain 'int'
+    // product x_add * y_add can exceed INT_MAX for large dimensions, which is
+    // undefined behavior and would produce a bogus ratio.
+    const uint64_t num = (uint64_t)dst_height * WEBP_RESCALER_ONE;
+    const uint64_t den = (uint64_t)wrk->x_add * wrk->y_add;
+    const uint64_t ratio = num / den;
+    if (ratio != (uint32_t)ratio) {
+      // We can't represent the ratio with the current fixed-point precision.
+      // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
+      wrk->fxy_scale = 0;
+    } else {
+      wrk->fxy_scale = (uint32_t)ratio;
+    }
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->y_sub);
+  } else {
+    wrk->fy_scale = WEBP_RESCALER_FRAC(1, wrk->x_add);
+    // wrk->fxy_scale is unused here.
+  }
  wrk->irow = work;
  wrk->frow = work + num_channels * dst_width;
+  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));

-  if (WebPRescalerImportRow == NULL) {
-    WebPRescalerImportRow = ImportRowC;
-    WebPRescalerExportRow = ExportRowC;
+  if (WebPRescalerImportRowExpand == NULL) {
+    WebPRescalerImportRowExpand = ImportRowExpandC;
+    WebPRescalerImportRowShrink = ImportRowShrinkC;
+    WebPRescalerExportRowExpand = ExportRowExpandC;
+    WebPRescalerExportRowShrink = ExportRowShrinkC;
    if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
      if (VP8GetCPUInfo(kMIPS32)) {
-        WebPRescalerImportRow = ImportRowMIPS;
-        WebPRescalerExportRow = ExportRowMIPS;
+        WebPRescalerImportRowExpand = ImportRowExpandMIPS;
+        WebPRescalerImportRowShrink = ImportRowShrinkMIPS;
+        WebPRescalerExportRowExpand = ExportRowExpandMIPS;
+        WebPRescalerExportRowShrink = ExportRowShrinkMIPS;
      }
 #endif
    }
@ -296,7 +529,10 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
 }

 #undef MULT_FIX
-#undef RFIX
+#undef WEBP_RESCALER_RFIX
+#undef WEBP_RESCALER_ONE
+#undef WEBP_RESCALER_FRAC
+#undef ROUNDER

 //------------------------------------------------------------------------------
 // all-in-one calls
@ -309,11 +545,20 @@ int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                       const uint8_t* src, int src_stride) {
  int total_imported = 0;
-  while (total_imported < num_lines && wrk->y_accum > 0) {
-    int channel;
-    for (channel = 0; channel < wrk->num_channels; ++channel) {
-      WebPRescalerImportRow(wrk, src, channel);
+  while (total_imported < num_lines && !WebPRescalerHasPendingOutput(wrk)) {
+    if (wrk->y_expand) {
+      rescaler_t* const tmp = wrk->irow;
+      wrk->irow = wrk->frow;
+      wrk->frow = tmp;
    }
+    WebPRescalerImportRow(wrk, src);
+    if (!wrk->y_expand) {     // Accumulate the contribution of the new row.
+      int x;
+      for (x = 0; x < wrk->num_channels * wrk->dst_width; ++x) {
+        wrk->irow[x] += wrk->frow[x];
+      }
+    }
+    ++wrk->src_y;
    src += src_stride;
    ++total_imported;
    wrk->y_accum -= wrk->y_sub;
@ -324,7 +569,7 @@ int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
 int WebPRescalerExport(WebPRescaler* const rescaler) {
  int total_exported = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
-    WebPRescalerExportRow(rescaler, 0);
+    WebPRescalerExportRow(rescaler);
    ++total_exported;
  }
  return total_exported;
--- a/src/utils/rescaler.h
+++ b/src/utils/rescaler.h
@ -21,20 +21,23 @@ extern "C" {
 #include "../webp/types.h"

 // Structure used for on-the-fly rescaling
+typedef uint32_t rescaler_t;   // type for side-buffer
 typedef struct {
  int x_expand;               // true if we're expanding in the x direction
+  int y_expand;               // true if we're expanding in the y direction
  int num_channels;           // bytes to jump between pixels
-  int fy_scale, fx_scale;     // fixed-point scaling factor
-  int64_t fxy_scale;          // ''
-  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  uint32_t fx_scale;          // fixed-point scaling factors
+  uint32_t fy_scale;          // ''
+  uint32_t fxy_scale;         // ''
  int y_accum;                // vertical accumulator
-  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
-  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int y_add, y_sub;           // vertical increments
+  int x_add, x_sub;           // horizontal increments
  int src_width, src_height;  // source dimensions
  int dst_width, dst_height;  // destination dimensions
+  int src_y, dst_y;           // row counters for input and output
  uint8_t* dst;
  int dst_stride;
-  int32_t* irow, *frow;       // work buffer
+  rescaler_t* irow, *frow;    // work buffer
 } WebPRescaler;

 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
@ -43,9 +46,7 @@ void WebPRescalerInit(WebPRescaler* const rescaler,
                      uint8_t* const dst,
                      int dst_width, int dst_height, int dst_stride,
                      int num_channels,
-                      int x_add, int x_sub,
-                      int y_add, int y_sub,
-                      int32_t* const work);
+                      rescaler_t* const work);

 // Returns the number of input lines needed next to produce one output line,
 // considering that the maximum available input lines are 'max_num_lines'.
@ -57,21 +58,29 @@ int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
 int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
                       const uint8_t* src, int src_stride);

-// Import a row of data and save its contribution in the rescaler.
-// 'channel' denotes the channel number to be imported.
-extern void (*WebPRescalerImportRow)(WebPRescaler* const wrk,
-                                     const uint8_t* const src, int channel);
-// Export one row (starting at x_out position) from rescaler.
-extern void (*WebPRescalerExportRow)(WebPRescaler* const wrk, int x_out);
-
-// Return true if there is pending output rows ready.
-static WEBP_INLINE
-int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
-  return (rescaler->y_accum <= 0);
-}
-
 // Export as many rows as possible. Return the numbers of rows written.
 int WebPRescalerExport(WebPRescaler* const rescaler);
+// Import a row of data and save its contribution in the rescaler.
+// Takes no 'channel' argument: the caller issues a single call per source row.
+void WebPRescalerImportRow(WebPRescaler* const wrk,
+                           const uint8_t* src);
+// Export one row from rescaler.
+void WebPRescalerExportRow(WebPRescaler* const wrk);
+
+// Return true if input is finished
+static WEBP_INLINE
+int WebPRescalerInputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->src_y >= rescaler->src_height);
+}
+// Return true if output is finished
+static WEBP_INLINE
+int WebPRescalerOutputDone(const WebPRescaler* const rescaler) {
+  return (rescaler->dst_y >= rescaler->dst_height);
+}
+
+// Return true if there are pending output rows ready.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  return !WebPRescalerOutputDone(rescaler) && (rescaler->y_accum <= 0);
+}

 //------------------------------------------------------------------------------

--- a/src/utils/utils.c
+++ b/src/utils/utils.c
@ -155,9 +155,9 @@ static void SubMem(void* ptr) {
 }

 #else
-#define Increment(v) do {} while(0)
-#define AddMem(p, s) do {} while(0)
-#define SubMem(p)    do {} while(0)
+#define Increment(v) do {} while (0)
+#define AddMem(p, s) do {} while (0)
+#define SubMem(p)    do {} while (0)
 #endif

 // Returns 0 in case of overflow of nmemb * size.
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@ -90,7 +90,7 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
 #pragma intrinsic(_BitScanReverse)

 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  uint32_t first_set_bit;
+  unsigned long first_set_bit;
  _BitScanReverse(&first_set_bit, n);
  return first_set_bit;
 }
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@ -444,16 +444,20 @@ struct WebPDecoderOptions {
  int dithering_strength;             // dithering strength (0=Off, 100=full)
 #if WEBP_DECODER_ABI_VERSION > 0x0203
  int flip;                           // flip output vertically
+#endif
+#if WEBP_DECODER_ABI_VERSION > 0x0204
  int alpha_dithering_strength;       // alpha dithering strength in [0..100]
 #endif

  // Unused for now:
  int force_rotation;                 // forced rotation (to be applied _last_)
  int no_enhancement;                 // if true, discard enhancement layer
-#if WEBP_DECODER_ABI_VERSION > 0x0203
-  uint32_t pad[3];                    // padding for later use
-#else
+  // Each optional 'int' field enabled above ('flip' for ABI > 0x0203,
+  // 'alpha_dithering_strength' for ABI > 0x0204) takes one word away from
+  // 'pad', so the number of trailing words is the same for every ABI version.
+#if WEBP_DECODER_ABI_VERSION <= 0x0203
  uint32_t pad[5];                    // padding for later use
+#elif WEBP_DECODER_ABI_VERSION <= 0x0204
+  uint32_t pad[4];                    // padding for later use
+#else
+  uint32_t pad[3];                    // padding for later use
 #endif
 };

--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@ -231,14 +231,14 @@ struct WebPMemoryWriter {
 // The following must be called first before any use.
 WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);

-#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
 // The following must be called to deallocate writer->mem memory. The 'writer'
 // object itself is not deallocated.
 WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer);
 #endif
 // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
 // completion, writer.mem and writer.size will hold the coded data.
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#if WEBP_ENCODER_ABI_VERSION > 0x0203
 // writer.mem must be freed by calling WebPMemoryWriterClear.
 #else
 // writer.mem must be freed by calling 'free(writer.mem)'.
@ -419,7 +419,9 @@ WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
 WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);

 // Rescale a picture to new dimension width x height.
-// Now gamma correction is applied.
+// If either 'width' or 'height' (but not both) is 0 the corresponding
+// dimension will be calculated preserving the aspect ratio.
+// No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
 WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);

@ -446,13 +448,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRA(
 WEBP_EXTERN(int) WebPPictureImportBGRX(
    WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);

-// Converts picture->argb data to the YUVA format specified by 'colorspace'.
+// Converts picture->argb data to the YUV420A format. The 'colorspace'
+// parameter is deprecated and should be equal to WEBP_YUV420.
 // Upon return, picture->use_argb is set to false. The presence of real
 // non-opaque transparent values is detected, and 'colorspace' will be
 // adjusted accordingly. Note that this method is lossy.
 // Returns false in case of error.
 WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
-                                       WebPEncCSP colorspace);
+                                       WebPEncCSP /*colorspace = WEBP_YUV420*/);

 // Same as WebPPictureARGBToYUVA(), but the conversion is done using
 // pseudo-random dithering with a strength 'dithering' between
@ -461,6 +464,15 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
 WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
    WebPPicture* picture, WebPEncCSP colorspace, float dithering);

+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+// Performs 'smart' RGBA->YUVA420 downsampling and colorspace conversion.
+// Downsampling is handled with extra care in case of color clipping. This
+// method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
+// YUV representation.
+// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+#endif
+
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A.
 // Note that the use of this method is discouraged if one has access to the
--- a/src/webp/types.h
+++ b/src/webp/types.h
@ -18,10 +18,11 @@

 #ifndef _MSC_VER
 #include <inttypes.h>
-#ifdef __STRICT_ANSI__
-#define WEBP_INLINE
-#else  /* __STRICT_ANSI__ */
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
 #define WEBP_INLINE inline
+#else
+#define WEBP_INLINE
 #endif
 #else
 typedef signed   char int8_t;