libwebp.jar: build w/Java 1.6 for Android compat

broken since: a5c297c swig/java: reduce wrapper function code duplication; this was a part of v0.3.1, but not v0.3.0. Change-Id: I001d4bd0a7a1aa1b2d267bc63bc1d8226bff00c1 (cherry picked from commit de899516c7)
fix memleak in WebPIDelete()
2013-10-16 19:17:16 +02:00 · 2013-10-16 19:16:53 +02:00
134 changed files with 3390 additions and 7366 deletions
--- a/.mailmap
+++ b/.mailmap
@@ -1,7 +1,6 @@
 <johann.koenig@duck.com> <johannkoenig@google.com>
 Mikołaj Zalewski <mikolajz@google.com>
 Pascal Massimino <pascal.massimino@gmail.com>
-<pascal.massimino@gmail.com> <skal@google.com>
 Vikas Arora <vikasa@google.com>
 <vikasa@google.com> <vikasa@gmail.com>
 <vikasa@google.com> <vikaas.arora@gmail.com>
--- a/1
+++ b/1
@@ -1,5 +1,4 @@
 Contributors:
- Charles Munger (clm at google dot com)
 - Christian Duvivier (cduvivier at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
--- a/Android.mk
+++ b/Android.mk
@@ -1,14 +1,6 @@
-LOCAL_PATH := $(call my-dir)
-
-WEBP_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD
-
-ifeq ($(APP_OPTIM),release)
-  WEBP_CFLAGS += -finline-functions -frename-registers -ffast-math -s \
-                 -ffunction-sections -fdata-sections
-endif
+LOCAL_PATH:= $(call my-dir)

 include $(CLEAR_VARS)
-
 LOCAL_SRC_FILES := \
    src/dec/alpha.c \
    src/dec/buffer.c \
@@ -47,7 +39,6 @@ LOCAL_SRC_FILES := \
    src/enc/tree.c \
    src/enc/vp8l.c \
    src/enc/webpenc.c \
-    src/utils/alpha_processing.c \
    src/utils/bit_reader.c \
    src/utils/bit_writer.c \
    src/utils/color_cache.c \
@@ -56,16 +47,16 @@ LOCAL_SRC_FILES := \
    src/utils/huffman_encode.c \
    src/utils/quant_levels.c \
    src/utils/quant_levels_dec.c \
-    src/utils/random.c \
    src/utils/rescaler.c \
    src/utils/thread.c \
    src/utils/utils.c \

-LOCAL_CFLAGS := $(WEBP_CFLAGS)
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/src
+LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
+                -DWEBP_USE_THREAD \
+                -finline-functions -frename-registers -ffast-math \
+                -s -fomit-frame-pointer -Isrc/webp

-# prefer arm over thumb mode for performance gains
-LOCAL_ARM_MODE := arm
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/src

 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
  # Setting LOCAL_ARM_NEON will enable -mfpu=neon which may cause illegal
@@ -77,22 +68,8 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 endif
 LOCAL_STATIC_LIBRARIES := cpufeatures

-LOCAL_MODULE := webp
+LOCAL_MODULE:= webp

 include $(BUILD_STATIC_LIBRARY)

-include $(CLEAR_VARS)
-
-LOCAL_SRC_FILES := \
-    examples/dwebp.c \
-    examples/example_util.c \
-
-LOCAL_CFLAGS := $(WEBP_CFLAGS)
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/src
-LOCAL_STATIC_LIBRARIES := webp
-
-LOCAL_MODULE := dwebp
-
-include $(BUILD_EXECUTABLE)
-
 $(call import-module,android/cpufeatures)
--- a/396
+++ b/396
@@ -1,281 +1,11 @@
-256e433 update NEWS description with new general features
-2962534 Merge "gif2webp: don't use C99 %zu" into 0.4.0
-3b9f9dd gif2webp: don't use C99 %zu
-b5b2e3c cwebp: fix metadata output w/lossy+alpha
-ad26df1 makefile.unix: clean up libgif2webp_util.a
-c3b4557 update Changelog
-ca84112 Merge "bump version to 0.4.0" into 0.4.0
-8c524db bump version to 0.4.0
-eec2398 update AUTHORS & .mailmap
-b9bbf6a update NEWS for 0.4.0
-c72e081 Merge "dec/webp.c: don't wait for data before reporting w/h"
-5ad6531 dec/frame.c: fix formatting
-f7fc4bc dec/webp.c: don't wait for data before reporting w/h
-66a32af Merge "NEON speed up"
-26d842e NEON speed up
-f307f98 Merge "webpmux: let -- stop parameter parsing"
-fe051da Merge "README: add a section on gif2webp"
-6fd2bd6 Merge "manpage pedantry"
-4af1900 README: add a section on gif2webp
-6f36ade manpage pedantry
-f9016cb README: update dwebp options
-b4fa0a4 webpmux: let -- stop parameter parsing
-a9a20ac gif2webp: Add a multi-threaded encode option
-495bef4 fix bug in TrellisQuantize
-605a712 simplify __cplusplus ifdef
-33109f9 Merge "drop: ifdef __cplusplus checks from C files"
-7f9de0b Merge changes I994a5587,I8467bb71,I13b50688,I1e2c9c7b
-5459030 gif2webp: let -- stop parameter parsing
-a4b0aa0 vwebp: let -- stop parameter parsing
-98af68f cwebp: let -- stop parameter parsing
-a33831e dwebp: let -- stop parameter parsing
-3630124 add some checks on error paths
-ce4c713 Merge "autoconf: add --disable-wic"
-5227d99 drop: ifdef __cplusplus checks from C files
-f645355 dwebp.1: fix typo
-f91034f Merge "cwebp: print metadata stats when no output file is given"
-d493455 gif2webp: Backward compatibility for giflib version <= 4.1.3
-4c617d3 gif2webp: Disable output of ICC profile by default
-73b731f introduce a special quantization function for WHT
-41c0cc4 Make Forward WHT transform use 32bit fixed-point calculation
-a3359f5 Only compute quantization params once
-7049043 cwebp: print metadata stats when no output file is given
-d513bb6 * fix off-by-one zthresh calculation * remove the sharpening for non luma-AC coeffs * adjust the bias a little bit to compensate for this
-ad9dec0 Merge "cosmetics: dwebp: fix local function name format"
-f737f03 Merge "dwebp: remove a dead store"
-3c3a70d Merge "makefile.unix: install binaries in $(DESTDIR)/bin/"
-150b655 Merge "Android.mk: add some release compile flags"
-dbebd33 cosmetics: dwebp: fix local function name format
-2774995 dwebp: remove a dead store
-a01e04f autoconf: add --disable-wic
-5009b22 makefile.unix: install binaries in $(DESTDIR)/bin/
-bab30fc Merge "fix -print_psnr / ssim options"
-ebef7fb fix -print_psnr / ssim options
-cb63785 Merge "fix bug due to overzealous check in WebPPictureYUVAToARGB()"
-8189885 Merge "EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE"
-4ad7d33 Android.mk: add some release compile flags
-c12e236 cosmetics: fix a few typos
-6f10403 fix bug due to overzealous check in WebPPictureYUVAToARGB()
-3f6c35c EstimateBestFilter: use an int to iterate WEBP_FILTER_TYPE
-cc55790 Merge changes I8bb7a4dc,I2c180051,I021a014f,I8a224a62
-c536afb Merge "cosmetics: fix some typos"
-cbdd3e6 add a -dither dithering option to the decoder
-e812401 Updated iosbuild.sh for XCode 5.x
-4931c32 cosmetics: fix some typos
-05aacf7 mux: add some missing casts
-617d934 enc/vp8l: add a missing cast
-46db286 idec: add some missing casts
-b524e33 ErrorStatusLossless: correct return type
-cb261f7 fix a descaling bug for vertical/horizontal U/V interpolation
-bcb3955 Merge changes I48968468,I181bc736
-73f5213 gif2webp: Add a mixed compression mode
-6198715 demux: split chunk parsing from ParseVP8X
-d2e3f4e demux: add a tail pointer for chunks
-87cffcc demux: cosmetics: s/has_frames/is_animation/
-e18e667 demux: strictly enforce the animation flag
-c4f39f4 demux: cosmetics: remove a useless break
-61cb884 demux: (non-exp) fail if the fragmented flag is set
-ff379db few % speedup of lossless encoding
-df3649a remove all disabled code related to P-frames
-6d0cb3d Merge "gif2webp: kmin = 0 should suppress key-frame addition."
-3655598 gif2webp: kmin = 0 should suppress key-frame addition.
-7708e60 Merge "detect flatness in blocks and favor DC prediction"
-06b1503 Merge "add comment about the kLevelsFromDelta[][] LUT generation"
-5935259 add comment about the kLevelsFromDelta[][] LUT generation
-e3312ea detect flatness in blocks and favor DC prediction
-ebc9b1e Merge "VPLBitReader bugfix: Catch error if bit_pos > LBITS too."
-96ad0e0 VPLBitReader bugfix: Catch error if bit_pos > LBITS too.
-a014e9c tune quantization biases toward higher precision
-1e89861 add helpful PrintBlockInfo() function
-596a6d7 make use of 'extern' consistent in function declarations
-c8d48c6 Merge "extract random utils to their own file util/random.[ch]"
-98aa33c extract random utils to their own file util/random.[ch]
-432a723 Merge "swig: add basic go bindings"
-fab618b Merge "rename libwebp.i -> libwebp.swig"
-e4e7fcd swig: add basic go bindings
-d340872 Merge "fast auto-determined filtering strength"
-f8bfd5c fast auto-determined filtering strength
-ac0bf95 small clean-up in ExpandMatrix()
-1939607 rename libwebp.i -> libwebp.swig
-43148b6 filtering: precompute ilimit and hev_threshold
-18f992e simplify f_inner calculation a little
-241d11f add missing const
-86c0031 add a 'format' field to WebPBitstreamFeatures
-dde91fd Demux: Correct the extended format validation
-5d6c5bd add entry for '-resize' option in cwebp's man
-7c098d1 Use some gamma-curve range compression when computing U/V average
-0b2b050 Use deterministic random-dithering during RGB->YUV conversion
-8a2fa09 Add a second multi-thread method
-7d6f2da Merge "up to 20% faster multi-threaded decoding"
-266f63e Merge "libwebp.jar: build w/Java 1.6 for Android compat"
-0532149 up to 20% faster multi-threaded decoding
-38efdc2 Simplify the gif2webp tool: move the optimization details to util
-de89951 libwebp.jar: build w/Java 1.6 for Android compat
-cb22155 Decode a full row of bitstream before reconstructing
-dca8a4d Merge "NEON/simple loopfilter: avoid q4-q7 registers"
-9e84d90 Merge "NEON/TransformWHT: avoid q4-q7 registers"
-fc10249 NEON/simple loopfilter: avoid q4-q7 registers
-2f09d63 NEON/TransformWHT: avoid q4-q7 registers
-77585a2 Merge "use a macrofunc for setting NzCoeffs bits"
-d155507 Merge "use HINT_GRAPH as image_hint for gif source"
-9c56164 Merge "only print GIF_DISPOSE_WARNING once"
-0587986 use HINT_GRAPH as image_hint for gif source
-0b28d7a use a macrofunc for setting NzCoeffs bits
-f9bbc2a Special-case sparse transform
-0012519 gif2webp: detect and flatten uniformly similar blocks
-0deaf0f only print GIF_DISPOSE_WARNING once
-6a8c0eb Merge "small optimization in segment-smoothing loop"
-f7146bc small optimization in segment-smoothing loop
-5a7533c small gif2webp fix
-4df0c89 Merge changes Ic697660c,I27285521
-5b2e6bd Android.mk: add a dwebp target
-f910a84 Android.mk: update build flags
-63f9aba special-case WHT transform when there's only DC
-80911ae Merge "7-8% faster decoding by rewriting GetCoeffs()"
-606c430 gif2webp: Improved compression for lossy animated WebP
-fb887f7 gif2webp: Different kmin/kmax defaults for lossy and lossless
-2a98136 7-8% faster decoding by rewriting GetCoeffs()
-92d47e4 improve VP8L signature detection by checking the version bits too
-5cd43e4 Add -incremental option to dwebp
-54b8e3f webpmux: DisplayInfo(): remove unnecessary error checks.
-40ae352 fix memleak in WebPIDelete()
-d966265 mux.h doc: WebPMuxGetFrame() can return WEBP_MUX_MEMORY_ERROR too.
-0e6747f webpmux -info: display dimensions and has_alpha per frame
-d78a82c Sanity check for underflow
-8498f4b Merge "remove -Wshadow warnings"
-e89c6fc Avoid a potential memleak
-3ebe175 Merge "break down the proba 4D-array into some handy structs"
-6a44550 break down the proba 4D-array into some handy structs
-2f5e893 remove -Wshadow warnings
-bf3a29b Merge "add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags"
-2b0a759 Merge "fix some warnings from static analysis"
-22dd07c mux.h: Some doc corrections
-79ff034 add proper WEBP_HAVE_GIF and WEBP_HAVE_GL flags
-d51f45f fix some warnings from static analysis
-d134307 fix conversion warning on MSVC
-d538cea gif2webp: Support a 'min' and 'max'  key frame interval
-80b54e1 allow search with token buffer loop and fix PARTITION0 problem
-b7d4e04 add VP8EstimateTokenSize()
-10fddf5 enc/quant.c: silence a warning
-399cd45 Merge "fix compile error on ARM/gcc"
-9f24519 encoder: misc rate-related fixes
-c663bb2 Merge "simplify VP8IteratorSaveBoundary() arg passing"
-fa46b31 Demux.h: Correct a method name reference
-f8398c9 fix compile error on ARM/gcc
-f691f0e simplify VP8IteratorSaveBoundary() arg passing
-42542be up to 6% faster encoding with clang compiler
-93402f0 multi-threaded segment analysis
-7e2d659 Merge "remove the PACK() bit-packing tricks"
-c13fecf remove the PACK() bit-packing tricks
-2fd091c Merge "use NULL for lf_stats_ testing, not bool"
-b11c9d6 dwebp: use default dct_method
-4bb8465 Merge "(de)mux.h: wrap pseudo-code in /* */"
-cfb56b1 make -pass option work with token buffers
-5416aab (de)mux.h: wrap pseudo-code in /* */
-35dba33 use NULL for lf_stats_ testing, not bool
-733a7fa enc->Iterator memory cleanup
-e81fac8 Add support for "no blend" in webpmux binary
-3b80bc4 gif2webp: Separate out each step into a method
-bef7e9c Add doc precision about demux object keeping pointers to data.
-61405a1 dwebp: enable stdout output with WIC
-6eabb88 Merge "Animated WebP: add "do no blend" option to spec"
-be20dec fix compilation for BITS 24
-e58cc13 Merge "dwebp: s/unsigned char/uint8_t/"
-72501d4 dwebp: s/unsigned char/uint8_t/
-2c9633e Merge "gif2webp: Insert independent frames at regular intervals."
-f0d6a14 gif2webp: Insert independent frames at regular intervals.
-b25a6fb yuv.h: fix indent
-ede3602 Merge "cosmetics: fix indent"
-3a65122 dwebp: fix stdout related output
-388a724 cosmetics: fix indent
-4c7322c Merge "dsp: msvc compatibility"
-d50c7e3 Merge "5-7% faster SSE2 versions of YUV->RGB conversion functions"
-b8ab784 Merge "simplify upsampler calls: only allow 'bottom' to be NULL"
-df6cebf 5-7% faster SSE2 versions of YUV->RGB conversion functions
-ad6ac32 simplify upsampler calls: only allow 'bottom' to be NULL
-a5e8afa output to stdout if file name is "-"
-f358450 dsp: msvc compatibility
-43a7c8e Merge "cosmetics"
-4c5f19c Merge "bit_reader.h: cosmetics"
-f72fab7 cosmetics
-14dd5e7 fix const-ness
-b20aec4 Merge "Support for 'do not blend' option in vwebp"
-dcf6522 Support for 'do not blend' option in vwebp
-d5bad03 Animated WebP: add "do no blend" option to spec
-a2f5f73 Merge "Support for "Do not blend" in mux and demux libraries"
-e081f2f Pack code & extra_bits to Struct (VP8LPrefixCode).
-6284854 Support for "Do not blend" in mux and demux libraries
-f486aaa Merge "slightly faster ParseIntraMode"
-d171863 slightly faster ParseIntraMode
-3ceca8a bit_reader.h: cosmetics
-69257f7 Create LUT for PrefixEncode.
-988b708 add WebPWorkerExecute() for convenient bypass
-06e2498 Merge "VP8EncIterator clean-up"
-de4d4ad VP8EncIterator clean-up
-7bbe952 Merge "cosmetics: thread.c: drop a redundant comment"
-da41148 cosmetics: thread.c: drop a redundant comment
-feb4b6e thread.h: #ifdef when checking WEBP_USE_THREAD
-8924a3a thread.c: drop WebPWorker prefix from static funcs
-1aed8f2 Merge "fix indent"
-4038ed1 fix indent
-1693fd9 Demux: A new state WEBP_DEMUX_PARSE_ERROR
-8dcae8b fix rescaling-with-alpha inaccuracy
-11249ab Merge changes I9b4dc36c,I4e0eef4d
-52508a1 Mux: support parsing unknown chunks within a frame/fragment.
-05db057 WebPMuxSetChunk: remove unused variable
-8ba1bf6 Stricter check for presence of alpha when writing lossless images
-a03c351 Demux: WebPIterator now also denotes if the frame has alpha.
-6df743a Decoder: handle fragments case correctly too.
-faa4b07 Support for unknown chunks in mux library
-7d60bbc Speed up HashChainFindCopy function.
-6674014 Speedup Alpha plane encoding.
-b7346a1 0.1 % speedup to decoding
-c606182 webp-container-spec: Tighten language added by last
-a34a502 pngdec: output error messages from libpng
-e84c625 Merge "Detect canvas and image size mismatch in decoder."
-f626fe2 Detect canvas and image size mismatch in decoder.
-f5fbdee demux: stricter image bounds check
-30c8158 add extra assert in Huffman decode code
-8967b9f SSE2 for lossless decoding (critical) functions.
-699d80e Jump-lookup for Huffman coding
-c34307a fix some VS9 warnings about type conversion
-eeada35 pngdec: add missing include
-54b6510 gif2webp: If aligning to even offsets, extra pixels should be transparent
-0bcf5ce Merge "remove a malloc() in case we're using only FILTER_NONE for alpha"
-2c07143 remove a malloc() in case we're using only FILTER_NONE for alpha
-a4d5f59 Faster lossless decoding
-fd53bb7 Merge "alternate LUT-base reverse-bits code"
-d1c166e Merge "Container spec: a clarification on background color."
-fdb9177 Rename a method
-5e96753 Container spec: a clarification on background color.
-30e77d0 Merge branch '0.3.0'
-1b631e2 alternate LUT-base reverse-bits code
-24cc307 ~20% faster lossless decoding
-313d853 Speedup for decoding lossless WebP photographs:
-24ee098 change the bytes_per_pixels_ field into more evocative use_8b_decode
-2a04b03 update ChangeLog (tag: v0.3.1-rc2, tag: v0.3.1)
 7288950 Regression fix for alpha channels using color cache:
 2e377b5 wicdec: silence a format warning
 ad9e42a muxedit: silence some uninitialized warnings
-3307c16 Don't set alpha-channel to 0xff for alpha->green uplift
-5130770 Merge "wicdec: silence a format warning"
-a37eff4 Regression fix for alpha channels using color cache:
-241cf99 Merge "muxedit: silence some uninitialized warnings"
-c8f9c84 Regression fix for alpha unfiltering:
-14cd5c6 muxedit: silence some uninitialized warnings
-a368db8 dec/vp8l: quiet vs9 x64 type conversion warning
-ffae9f3 wicdec: silence a format warning
-8cf0701 Alpha encoding: never filter in case of NO_COMPRESSION
-825e73b update ChangeLog (tag: v0.3.1-rc1)
+825e73b update ChangeLog
 abf6f69 update NEWS
 5a92c1a bump version to 0.3.1
-86daf77 store top Y/U/V samples in packed fashion
 67bc353 Revert "add WebPBlendAlpha() function to blend colors against background"
-068db59 Intertwined decoding of alpha and RGB
 38cc011 Simplify forward-WHT + SSE2 version
-3fa595a Support decoding upto given row in DECODE_DATA_FUNC
-520f005 DequantizeLevels(): Add 'row' and 'num_rows' args
-47374b8 Alpha unfilter for given set of rows
 f32097e probe input file and quick-check for WebP format.
 a2aed1d configure: improve gl/glut library test
 c7e89cb update copyright text
@@ -324,133 +54,11 @@ f4c7b65 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only i
 1fb04be pngdec: Avoid a double-free.
 dcbb1ca add WebPBlendAlpha() function to blend colors against background
 bc9f5fb configure.ac: add AM_PROG_AR for automake >= 1.12
-bf867bf Tuned cross_color parameter (step) for lower qual
-90e2ec5 Merge "probe input file and quick-check for WebP format."
-7180d7f Merge "update copyright text"
-830f72b probe input file and quick-check for WebP format.
-2ccf58d configure: improve gl/glut library test
-d640614 update copyright text
-c2113ad Merge "configure: remove use of AS_VAR_APPEND"
-9326a56 configure: remove use of AS_VAR_APPEND
-ea63d61 fix a type warning on VS9 x86
-bec1109 fix EXIF parsing in PNG
-b6e65f3 Merge "fix warnings for vs9 x64"
-438946d fix warnings for vs9 x64
-f4710e3 collect macroblock reconstruction data in VP8MBData struct
-23d28e2 add doc precision for WebPPictureCopy() and WebPPictureView()
-518f2cd cosmetics: gif2webp: fix indent
-af358e6 Merge "remove datatype qualifier for vmnv"
-3fe9163 remove datatype qualifier for vmnv
-764fdff fix a memory leak in gif2webp
-3e59a74 fix two minor memory leaks in webpmux
-47b9862 Merge "README: update swig notes"
-325d15f remove some cruft from swig/libwebp.jar
-4a7627c README: update swig notes
-5da81e3 Merge "swig/python: add minimal documentation"
-f39e08f Merge "swig: add python encode support"
-6ca4a3e Merge "swig/java: reduce wrapper function code duplication"
-8f8702b Merge "swig/java: rework uint8_t typemap"
-91413be reduce memory for VP8MB and remove bitfields use
-7413394 Fix the memory leak in ApplyFilters.
-2053c2c simplify the alpha-filter testing loop
-825b64d swig/python: add minimal documentation
-14677e1 swig: add python encode support
-a5c297c swig/java: reduce wrapper function code duplication
-ad4a367 swig/java: rework uint8_t typemap
-0d25876 use uint8_t for inv_palette[]
-afa3450 Fix the bug in ApplyPalette.
-2d6ac42 Merge "webp/lossless: fix big endian BGRA output"
-2ca8396 webp/lossless: fix big endian BGRA output
-742110c Speed up ApplyPalette for ARGB pixels.
-2451e47 misc code cleanup
-83db404 Merge "swig: add python (decode) support"
-eeeea8b Merge "swig: cosmetics"
-d5f9b8f Merge "libwebp: fix vp8 encoder mem alloc offsetting"
-d8edd83 libwebp: fix vp8 encoder mem alloc offsetting
-8983b83 remove use of bit-fields in VP8FInfo
-87a4fca remove some warnings:
-ba8f74e Merge "fix for big-endian"
-a65067f Merge "Further reduce memory to decode lossy+alpha images"
-64c8448 Further reduce memory to decode lossy+alpha images
-332130b Mux: make a few methods static
-4437061 fix for big-endian
-5199eab Merge "add uncompressed TIFF output support"
-a3aede9 add uncompressed TIFF output support
-f975b67 Merge "gif2webp: Fix signed/unsigned comparison mismatch"
-5fbc734 Merge "GetFeatures: Detect invalid VP8X/VP8/VP8L data"
-d5060c8 Merge "mux.h: A comment fix + some consistency fixes"
-352d0de GetFeatures: Detect invalid VP8X/VP8/VP8L data
-3ef79fe Cosmetic: "width * height"
-043e1ae gif2webp: Fix signed/unsigned comparison mismatch
-5818cff mux.h: A comment fix + some consistency fixes
-1153f88 Merge "swig: ifdef some Java specific code"
-3eeedae Makefile.vc: fix libwebpdemux dll variable typo
-f980faf swig: add python (decode) support
-7f5f42b swig: cosmetics
-8eae188 WebP-Lossless encoding improvements.
-c7247c4 swig: ifdef some Java specific code
-4cb234d Merge "Mux: make ValidateForSingleImage() method static"
-ed6f530 Merge "Add GetCanvasSize() method to mux"
-1d530c9 Mux: make ValidateForSingleImage() method static
-bba4c2b configure: add warning related flags
-fffefd1 Add GetCanvasSize() method to mux
-732da8d Merge "configure: add GLUT detection; build vwebp"
-0e513f7 configure: add GLUT detection; build vwebp
-55d1c15 Merge "Alpha decoding: significantly reduce memory usage"
-13d99fb Merge "configure: add --enable-everything"
-2bf698f Merge "configure.ac: add some helper macros"
-edccd19 Alpha decoding: significantly reduce memory usage
-3cafcc9 configure: add --enable-everything
-4ef1447 configure.ac: add some helper macros
-a4e1cdb Remove the gcc compilation comments
-6393fe4 Cosmetic fixes
-9c4ce97 Simplify forward-WHT + SSE2 version
-878b9da fix missed optim
-0004617 VP8GetInfo(): Check for zero width or height.
-9bf3129 align VP8Encoder::nz_ allocation
-5da165c fix CheckMode() signature
-0ece07d Merge "explicitly pad bitfields to 32-bits"
-9dbc9d1 explicitly pad bitfields to 32-bits
-5369a80 Merge "prevent signed int overflow in left shift ops"
-70e3971 Merge "cosmetics: remove unnecessary ';'s"
-d3136ce Merge "don't forward declare enums"
-b26e5ad gif2webp: Fix ICC and XMP support
-46089b2 Add missing name to AUTHORS
-94328d6 Demux: Fix a potential memleak
-96e948d don't forward declare enums
-f4f9088 prevent signed int overflow in left shift ops
-0261545 cosmetics: remove unnecessary ';'s
-7ebdf11 Merge "Fix few missing comparisons to NULL"
-1579989 Fix few missing comparisons to NULL
-ea1b21c Cleaned up VP8GetHeaders() so that it parses only frame header
-b66caee dwebp: add support for BMP output
-ff885bf add precision about dynamic output reallocation with IDecoder
-79241d5 Merge "Makefile.vc: have 'all' target build everything"
-ac1c729 Merge "Makefile.vc: flags cleanup"
-118a055 Merge "Makefile.vc: drop /FD flag"
-ecad010 Merge "update gitignore"
-a681b4f Rename PRE_VP8 state to WEBP_HEADER
-ead4d47 Add incremental support for extended format files
-69d0f92 Makefile.vc: have 'all' target build everything
-5296749 Makefile.vc: flags cleanup
-c61baf0 Makefile.vc: drop /FD flag
-3a15125 update gitignore
-5167ca4 Merge "WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded."
-67708d6 WebPEncode: An additional check. Start VP8EncLoop/VP8EncTokenLoop only if VP8EncStartAlpha succeeded.
-b68912a pngdec: Avoid a double-free.
-82abbe1 Merge "configure.ac: add AM_PROG_AR for automake >= 1.12"
-e7d9548 add WebPBlendAlpha() function to blend colors against background
-ed4dc71 configure.ac: add AM_PROG_AR for automake >= 1.12
-df4a406 Merge branch '0.3.0'
 1e0d4b8 Update ChangeLog (tag: v0.3.0-rc7, tag: v0.3.0)
 d52b405 Cosmetic fixes
 6cb4a61 misc style fix
 68111ab add missing YUVA->ARGB automatic conversion in WebPEncode()
-e9a7990 Cosmetic fixes
 403bfe8 Container spec: Clarify frame disposal
-2aaa423 Merge "add missing YUVA->ARGB automatic conversion in WebPEncode()"
-07d87bd add missing YUVA->ARGB automatic conversion in WebPEncode()
-142c462 misc style fix
 3e7a13a Merge "Container spec: clarify the background color field" into 0.3.0
 14af774 container doc: add a note about the 'ANMF' payload
 cc635ef Container spec: clarify the background color field
@@ -773,7 +381,7 @@ a61a824 Merge "Add NULL check in chunk APIs"
 a077072 mux struct naming
 6c66dde Merge "Tune Lossless encoder"
 ab5ea21 Tune Lossless encoder
-74fefc8 Update ChangeLog (tag: v0.2.1, origin/0.2.0, 0.2.0)
+74fefc8 Update ChangeLog (tag: v0.2.1, origin/0.2.0)
 92f8059 Rename some chunks:
 3bb4bbe Merge "Mux API change:"
 d0c79f0 Mux API change:
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -217,14 +217,12 @@ MUX_OBJS = \
    $(DIROBJ)\mux\muxread.obj \

 UTILS_DEC_OBJS = \
-    $(DIROBJ)\utils\alpha_processing.obj \
    $(DIROBJ)\utils\bit_reader.obj \
    $(DIROBJ)\utils\color_cache.obj \
    $(DIROBJ)\utils\filters.obj \
    $(DIROBJ)\utils\huffman.obj \
    $(DIROBJ)\utils\quant_levels_dec.obj \
    $(DIROBJ)\utils\rescaler.obj \
-    $(DIROBJ)\utils\random.obj \
    $(DIROBJ)\utils\thread.obj \
    $(DIROBJ)\utils\utils.obj \

--- a/15
+++ b/15
@@ -1,18 +1,3 @@
- 12/19/13: version 0.4.0
-  * improved gif2webp tool
-  * numerous fixes, compression improvement and speed-up
-  * dither option added to decoder (dwebp -dither 50 ...)
-  * improved multi-threaded modes (-mt option)
-  * improved filtering strength determination
-  * New function: WebPMuxGetCanvasSize
-  * BMP and TIFF format output added to 'dwebp'
-  * Significant memory reduction for decoding lossy images with alpha.
-  * Intertwined decoding of RGB and alpha for a shorter
-    time-to-first-decoded-pixel.
-  * WebPIterator has a new member 'has_alpha' denoting whether the frame
-    contains transparency.
-  * Container spec amended with new 'blending method' for animation.
-
 - 6/13/13: version 0.3.1
  This is a binary compatible release.
  * Add incremental decoding support for images containing ALPH and ICCP chunks.
--- a/57
+++ b/57
@@ -4,7 +4,7 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.4.0
+               \____/____/\_____/_____/____/v0.3.1

 Description:
 ============
@@ -80,7 +80,7 @@ more options.
 SWIG bindings:
 --------------

-To generate language bindings from swig/libwebp.swig at least swig-1.3
+To generate language bindings from swig/libwebp.i at least swig-1.3
 (http://www.swig.org) is required.

 Currently the following functions are mapped:
@@ -115,7 +115,7 @@ DLL that can be loaded via System.loadLibrary("webp_jni").
 Python bindings:

 To build the swig-generated Python extension code at least Python 2.6 is
-required. Python < 2.6 may build with some minor changes to libwebp.swig or the
+required. Python < 2.6 may build with some minor changes to libwebp.i or the
 generated code, but is untested.

 Encoding tool:
@@ -178,10 +178,6 @@ options:
  -alpha_filter <string> . predictive filtering for alpha plane.
                           One of: none, fast (default) or best.
  -alpha_cleanup ......... Clean RGB values in transparent area.
-  -blend_alpha <hex> ..... Blend colors against background color
-                           expressed as RGB values written in
-                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0
-                           green=0xe0 and blue=0xd0.
  -noalpha ............... discard any transparency information.
  -lossless .............. Encode image losslessly.
  -hint <string> ......... Specify image characteristics hint.
@@ -255,23 +251,18 @@ Decodes the WebP image file to PNG format [Default]
 Use following options to convert into alternate image formats:
  -pam ......... save the raw RGBA samples as a color PAM
  -ppm ......... save the raw RGB samples as a color PPM
-  -bmp ......... save as uncompressed BMP format
-  -tiff ........ save as uncompressed TIFF format
  -pgm ......... save the raw YUV samples as a grayscale PGM
-                 file with IMC4 layout
-  -yuv ......... save the raw YUV samples in flat layout
+                 file with IMC4 layout.
+  -yuv ......... save the raw YUV samples in flat layout.

 Other options are:
  -version  .... print version number and exit.
  -nofancy ..... don't use the fancy YUV420 upscaler.
  -nofilter .... disable in-loop filtering.
-  -nodither .... disable dithering.
-  -dither <d> .. dithering strength (in 0..100)
  -mt .......... use multi-threading
  -crop <x> <y> <w> <h> ... crop output with the given rectangle
  -scale <w> <h> .......... scale the output (*after* any cropping)
  -alpha ....... only save the alpha plane.
-  -incremental . use incremental decoding (useful for tests)
  -h     ....... this help message.
  -v     ....... verbose (e.g. print encoding/decoding times)
  -noasm ....... disable all assembly optimizations.
@@ -292,7 +283,6 @@ Options are:
  -noicc ....... don't use the icc profile if present.
  -nofancy ..... don't use the fancy YUV420 upscaler.
  -nofilter .... disable in-loop filtering.
-  -dither <int>  dithering strength (0..100). Default=50.
  -mt .......... use multi-threading.
  -info ........ print info.
  -h     ....... this help message.
@@ -328,43 +318,6 @@ $ make -f makefile.unix examples/vwebp
 > nmake /f Makefile.vc CFG=release-static \
    ../obj/x64/release-static/bin/vwebp.exe

-Animated GIF conversion:
-========================
-Animated GIF files can be converted to WebP files with animation using the
-gif2webp utility available under examples/. The files can then be viewed using
-vwebp.
-
-Usage:
- gif2webp [options] gif_file -o webp_file
-options:
-  -h / -help  ............ this help
-  -lossy ................. Encode image using lossy compression.
-  -mixed ................. For each frame in the image, pick lossy
-                           or lossless compression heuristically.
-  -q <float> ............. quality factor (0:small..100:big)
-  -m <int> ............... compression method (0=fast, 6=slowest)
-  -kmin <int> ............ Min distance between key frames
-  -kmax <int> ............ Max distance between key frames
-  -f <int> ............... filter strength (0=off..100)
-  -metadata <string> ..... comma separated list of metadata to
-                           copy from the input to the output if present.
-                           Valid values: all, none, icc, xmp (default)
-  -mt .................... use multi-threading if available
-
-  -version ............... print version number and exit.
-  -v ..................... verbose.
-  -quiet ................. don't print anything.
-
-Building:
---------
-With the libgif development files installed, gif2webp can be built using
-makefile.unix:
-$ make -f makefile.unix examples/gif2webp
-
-or using autoconf:
-$ ./configure --enable-everything
-$ make
-
 Encoding API:
 =============

--- a/README.mux
+++ b/README.mux
@@ -1,7 +1,7 @@
          __   __  ____  ____  ____  __ __  _     __ __
         /  \\/  \/  _ \/  _ \/  _ \/  \  \/ \___/_ / _\
         \       /   __/  _  \   __/      /  /  (_/  /__
-          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.2.0
+          \__\__/\_____/_____/__/  \__//_/\_____/__/___/v0.1.1


 Description:
@@ -56,12 +56,11 @@ STRIP_OPTIONS:

 FRAME_OPTIONS(i):
 Create animation.
-   file_i +di+[xi+yi[+mi[bi]]]
+   file_i +di+xi+yi+mi
   where:    'file_i' is the i'th animation frame (WebP format),
             'di' is the pause duration before next frame.
             'xi','yi' specify the image offset for this frame.
             'mi' is the dispose method for this frame (0 or 1).
-             'bi' is the blending method for this frame (+b or -b).

 LOOP_COUNT:
 Number of times to repeat the animation.
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([libwebp], [0.4.0],
+AC_INIT([libwebp], [0.3.1],
        [http://code.google.com/p/webp/issues],,
        [http://developers.google.com/speed/webp])
 AC_CANONICAL_TARGET
@@ -187,8 +187,6 @@ if test "$glut_headers" = "yes"; then
           glutMainLoop();
          ])
        ],
-        AC_DEFINE(WEBP_HAVE_GL, [1],
-                  [Set to 1 if OpenGL is supported])
        [glut_support=yes], []
      )
      if test "$glut_support" = "yes"; then
@@ -298,8 +296,6 @@ LIBCHECK_PROLOGUE([GIF])
 AC_CHECK_HEADER(gif_lib.h,
  AC_CHECK_LIB([gif], [DGifOpenFileHandle],
               [GIF_LIBS="$GIF_LIBS -lgif"
-                AC_DEFINE(WEBP_HAVE_GIF, [1],
-                          [Set to 1 if GIF library is installed])
                gif_support=yes
               ],
               AC_MSG_WARN(Optional gif library not found),
@@ -316,13 +312,7 @@ AM_CONDITIONAL([BUILD_GIF2WEBP], [test "${build_gif2webp}" = "yes"])

 dnl === check for WIC support ===

-AC_ARG_ENABLE([wic],
-              AS_HELP_STRING([--disable-wic],
-                             [Disable Windows Imaging Component (WIC) detection.
-                              @<:@default=auto@:>@]),,
-              [enable_wic=yes])
-
-if test "$target_os" = "mingw32" -a "$enable_wic" = "yes"; then
+if test "$target_os" = "mingw32"; then
  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
  if test "$ac_cv_header_wincodec_h" = "yes"; then
    AC_MSG_CHECKING(for Windows Imaging Component support)
@@ -453,8 +443,7 @@ dwebp : yes
  =====================
  PNG  : ${png_support-no}
  WIC  : ${wic_support-no}
-GIF support : ${gif_support-no}
-gif2webp    : ${build_gif2webp-no}
-webpmux     : ${enable_libwebpmux-no}
-vwebp       : ${build_vwebp-no}
+gif2webp : ${build_gif2webp-no}
+webpmux  : ${enable_libwebpmux-no}
+vwebp    : ${build_vwebp-no}
 ])
--- a/doc/webp-container-spec.txt
+++ b/doc/webp-container-spec.txt
@@ -382,17 +382,13 @@ animation.
 Background Color: 32 bits (_uint32_)

 : The default background color of the canvas in \[Blue, Green, Red, Alpha\]
-byte order. This color MAY be used to fill the unused space on the canvas around
-the frames, as well as the transparent pixels of the first frame. Background
-color is also used when disposal method is `1`.
+byte order. This color is used to fill the unused space on the canvas around the
+frames, as well as the transparent pixels of the first frame. Background color
+is also used when disposal method is `1`.

-**Note**:
-
-  * Background color MAY contain a transparency value (alpha), even if the
-    _Alpha_ flag in [VP8X chunk](#extended_header) is unset.
-
-  * Viewer applications SHOULD treat the background color value as a hint, and
-    are not required to use it.
+**Note**: Viewers that have a preferred background against which to present the
+images (web browsers, for example) should ignore this value and use their
+preferred background color instead.

 Loop Count: 16 bits (_uint16_)

@@ -419,7 +415,7 @@ If the _Animation flag_ is not set, then this chunk SHOULD NOT be present.
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    ...             |           Frame Height Minus One              |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-    |                 Frame Duration                |  Reserved |B|D|
+    |                 Frame Duration                |  Reserved   |D|
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
    |                         Frame Data                            |
    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -448,24 +444,10 @@ Frame Duration: 24 bits (_uint24_)
 In particular, frame duration of 0 is useful when one wants to update multiple
 areas of the canvas at once during the animation.

-Reserved: 6 bits
+Reserved: 7 bits

 : SHOULD be 0.

-Blending method (B): 1 bit
-
-: Indicates how transparent pixels of _the current frame_ are to be blended with
-corresponding pixels of the previous canvas:
-
-  * `0`: Use alpha blending. After disposing of the previous frame, render the
-    current frame on the canvas using [alpha-blending](#alpha-blending). If the
-    current frame does not have an alpha channel, assume alpha value of 255,
-    effectively replacing the rectangle.
-
-  * `1`: Do not blend. After disposing of the previous frame, render the
-    current frame on the canvas by overwriting the rectangle covered by the
-    current frame.
-
 Disposal method (D): 1 bit

 : Indicates how _the current frame_ is to be treated after it has been displayed
@@ -477,6 +459,10 @@ Disposal method (D): 1 bit
    by the _current frame_ with background color specified in the
    [ANIM chunk](#anim_chunk).

+After disposing the current frame, render the next frame on the canvas using
+[alpha-blending](#alpha-blending). If the next frame does not have an alpha
+channel, assume alpha value of 255, effectively replacing the rectangle.
+
 **Notes**:

  * The frame disposal only applies to the _frame rectangle_, that is, the
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -30,7 +30,7 @@ cwebp_CPPFLAGS  = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE)
 cwebp_CPPFLAGS += $(JPEG_INCLUDES) $(PNG_INCLUDES) $(TIFF_INCLUDES)
 cwebp_LDADD = ../src/libwebp.la $(JPEG_LIBS) $(PNG_LIBS) $(TIFF_LIBS)

-gif2webp_SOURCES = gif2webp.c gif2webp_util.c
+gif2webp_SOURCES = gif2webp.c
 gif2webp_CPPFLAGS = $(AM_CPPFLAGS) $(USE_EXPERIMENTAL_CODE) $(GIF_INCLUDES)
 gif2webp_LDADD  = libexampleutil.la ../src/mux/libwebpmux.la ../src/libwebp.la
 gif2webp_LDADD += $(GIF_LIBS)
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@@ -31,13 +31,13 @@
 #include "./wicdec.h"

 #ifndef WEBP_DLL
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

 extern void* VP8GetCPUInfo;   // opaque forward declaration.

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  // WEBP_DLL
@@ -494,14 +494,11 @@ static int WriteWebPWithMetadata(FILE* const out,
    if (has_vp8x) {  // update the existing VP8X flags
      webp[kChunkHeaderSize] |= (uint8_t)(flags & 0xff);
      ok = ok && (fwrite(webp, kVP8XChunkSize, 1, out) == 1);
-      webp += kVP8XChunkSize;
      webp_size -= kVP8XChunkSize;
    } else {
      const int is_lossless = !memcmp(webp, "VP8L", kTagSize);
-      if (is_lossless) {
-        // Presence of alpha is stored in the 29th bit of VP8L data.
-        if (webp[kChunkHeaderSize + 3] & (1 << 5)) flags |= kAlphaFlag;
-      }
+      // The alpha flag is forced with lossless images.
+      if (is_lossless) flags |= kAlphaFlag;
      ok = ok && (fwrite(kVP8XHeader, kChunkHeaderSize, 1, out) == 1);
      ok = ok && WriteLE32(out, flags);
      ok = ok && WriteLE24(out, picture->width - 1);
@@ -599,10 +596,6 @@ static void HelpLong(void) {
  printf("  -alpha_filter <string> . predictive filtering for alpha plane.\n");
  printf("                           One of: none, fast (default) or best.\n");
  printf("  -alpha_cleanup ......... Clean RGB values in transparent area.\n");
-  printf("  -blend_alpha <hex> ..... Blend colors against background color\n"
-         "                           expressed as RGB values written in\n"
-         "                           hexadecimal, e.g. 0xc0e0d0 for red=0xc0\n"
-         "                           green=0xe0 and blue=0xd0.\n");
  printf("  -noalpha ............... discard any transparency information.\n");
  printf("  -lossless .............. Encode image losslessly.\n");
  printf("  -hint <string> ......... Specify image characteristics hint.\n");
@@ -665,8 +658,6 @@ int main(int argc, const char *argv[]) {
  int short_output = 0;
  int quiet = 0;
  int keep_alpha = 1;
-  int blend_alpha = 0;
-  uint32_t background_color = 0xffffffu;
  int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
  int resize_w = 0, resize_h = 0;
  int show_progress = 0;
@@ -731,10 +722,6 @@ int main(int argc, const char *argv[]) {
      config.alpha_compression = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-alpha_cleanup")) {
      keep_alpha = keep_alpha ? 2 : 0;
-    } else if (!strcmp(argv[c], "-blend_alpha") && c < argc - 1) {
-      blend_alpha = 1;
-      background_color = strtol(argv[++c], NULL, 16);  // <- parses '0x' prefix
-      background_color = background_color & 0x00ffffffu;
    } else if (!strcmp(argv[c], "-alpha_filter") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "none")) {
@@ -751,6 +738,7 @@ int main(int argc, const char *argv[]) {
      keep_alpha = 0;
    } else if (!strcmp(argv[c], "-lossless")) {
      config.lossless = 1;
+      picture.use_argb = 1;
    } else if (!strcmp(argv[c], "-hint") && c < argc - 1) {
      ++c;
      if (!strcmp(argv[c], "photo")) {
@@ -896,9 +884,6 @@ int main(int argc, const char *argv[]) {
 #endif
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
-    } else if (!strcmp(argv[c], "--")) {
-      if (c < argc - 1) in_file = argv[++c];
-      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      HelpLong();
@@ -933,7 +918,7 @@ int main(int argc, const char *argv[]) {

  // Read the input
  if (verbose) {
-    StopwatchReset(&stop_watch);
+    StopwatchReadAndReset(&stop_watch);
  }
  if (!ReadPicture(in_file, &picture, keep_alpha,
                   (keep_metadata == 0) ? NULL : &metadata)) {
@@ -941,11 +926,6 @@ int main(int argc, const char *argv[]) {
    goto Error;
  }
  picture.progress_hook = (show_progress && !quiet) ? ProgressReport : NULL;
-
-  if (blend_alpha) {
-    WebPBlendAlpha(&picture, background_color);
-  }
-
  if (keep_alpha == 2) {
    WebPCleanupTransparentArea(&picture);
  }
@@ -987,7 +967,7 @@ int main(int argc, const char *argv[]) {

  // Compress
  if (verbose) {
-    StopwatchReset(&stop_watch);
+    StopwatchReadAndReset(&stop_watch);
  }
  if (crop != 0) {
    // We use self-cropping using a view.
@@ -1028,32 +1008,11 @@ int main(int argc, const char *argv[]) {
    }
  }

-  if (keep_metadata != 0) {
-    if (out != NULL) {
-      if (!WriteWebPWithMetadata(out, &picture, &memory_writer,
-                                 &metadata, keep_metadata, &metadata_written)) {
-        fprintf(stderr, "Error writing WebP file with metadata!\n");
-        goto Error;
-      }
-    } else {  // output is disabled, just display the metadata stats.
-      const struct {
-        const MetadataPayload* const payload;
-        int flag;
-      } *iter, info[] = {
-        { &metadata.exif, METADATA_EXIF },
-        { &metadata.iccp, METADATA_ICC },
-        { &metadata.xmp, METADATA_XMP },
-        { NULL, 0 }
-      };
-      uint32_t unused1 = 0;
-      uint64_t unused2 = 0;
-
-      for (iter = info; iter->payload != NULL; ++iter) {
-        if (UpdateFlagsAndSize(iter->payload, !!(keep_metadata & iter->flag),
-                               0, &unused1, &unused2)) {
-          metadata_written |= iter->flag;
-        }
-      }
+  if (keep_metadata != 0 && out != NULL) {
+    if (!WriteWebPWithMetadata(out, &picture, &memory_writer,
+                               &metadata, keep_metadata, &metadata_written)) {
+      fprintf(stderr, "Error writing WebP file with metadata!\n");
+      goto Error;
    }
  }

@@ -1070,22 +1029,8 @@ int main(int argc, const char *argv[]) {
  if (!quiet && !short_output && print_distortion >= 0) {  // print distortion
    static const char* distortion_names[] = { "PSNR", "SSIM", "LSIM" };
    float values[5];
-    // Comparison is performed in YUVA colorspace.
-    if (original_picture.use_argb &&
-        !WebPPictureARGBToYUVA(&original_picture, WEBP_YUV420A)) {
-      fprintf(stderr, "Error while converting original picture to YUVA.\n");
-      goto Error;
-    }
-    if (picture.use_argb &&
-        !WebPPictureARGBToYUVA(&picture, WEBP_YUV420A)) {
-      fprintf(stderr, "Error while converting compressed picture to YUVA.\n");
-      goto Error;
-    }
-    if (!WebPPictureDistortion(&picture, &original_picture,
-                               print_distortion, values)) {
-      fprintf(stderr, "Error while computing the distortion.\n");
-      goto Error;
-    }
+    WebPPictureDistortion(&picture, &original_picture,
+                          print_distortion, values);
    fprintf(stderr, "%s: Y:%.2f U:%.2f V:%.2f A:%.2f  Total:%.2f\n",
            distortion_names[print_distortion],
            values[0], values[1], values[2], values[3], values[4]);
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -32,30 +32,24 @@
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
                         // code with COBJMACROS.
-#include <ole2.h>  // CreateStreamOnHGlobal()
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>
 #endif

-#if defined(_WIN32)
-#include <fcntl.h>   // for _O_BINARY
-#include <io.h>      // for _setmode()
-#endif
-
 #include "webp/decode.h"
 #include "./example_util.h"
 #include "./stopwatch.h"

 static int verbose = 0;
 #ifndef WEBP_DLL
-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

 extern void* VP8GetCPUInfo;   // opaque forward declaration.

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
 #endif  // WEBP_DLL
@@ -68,8 +62,6 @@ typedef enum {
  PAM,
  PPM,
  PGM,
-  BMP,
-  TIFF,
  YUV,
  ALPHA_PLANE_ONLY  // this is for experimenting only
 } OutputFileFormat;
@@ -90,15 +82,9 @@ typedef enum {
 #define MAKE_REFGUID(x) &(x)
 #endif

-static HRESULT CreateOutputStream(const char* out_file_name,
-                                  int write_to_mem, IStream** stream) {
+static HRESULT CreateOutputStream(const char* out_file_name, IStream** stream) {
  HRESULT hr = S_OK;
-  if (write_to_mem) {
-    // Output to a memory buffer. This is freed when 'stream' is released.
-    IFS(CreateStreamOnHGlobal(NULL, TRUE, stream));
-  } else {
-    IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
-  }
+  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, stream));
  if (FAILED(hr)) {
    fprintf(stderr, "Error opening output file %s (%08lx)\n",
            out_file_name, hr);
@@ -106,9 +92,8 @@ static HRESULT CreateOutputStream(const char* out_file_name,
  return hr;
 }

-static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
-                             REFGUID container_guid,
-                             uint8_t* rgb, int stride,
+static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
+                             unsigned char* rgb, int stride,
                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
  IWICImagingFactory* factory = NULL;
@@ -129,7 +114,7 @@ static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
            "Windows XP SP3 or newer?). PNG support not available. "
            "Use -ppm or -pgm for available PPM and PGM formats.\n");
  }
-  IFS(CreateOutputStream(out_file_name, use_stdout, &stream));
+  IFS(CreateOutputStream(out_file_name, &stream));
  IFS(IWICImagingFactory_CreateEncoder(factory, container_guid, NULL,
                                       &encoder));
  IFS(IWICBitmapEncoder_Initialize(encoder, stream,
@@ -143,28 +128,6 @@ static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
  IFS(IWICBitmapFrameEncode_Commit(frame));
  IFS(IWICBitmapEncoder_Commit(encoder));

-  if (SUCCEEDED(hr) && use_stdout) {
-    HGLOBAL image;
-    IFS(GetHGlobalFromStream(stream, &image));
-    if (SUCCEEDED(hr)) {
-      HANDLE std_output = GetStdHandle(STD_OUTPUT_HANDLE);
-      DWORD mode;
-      const BOOL update_mode = GetConsoleMode(std_output, &mode);
-      const void* const image_mem = GlobalLock(image);
-      DWORD bytes_written = 0;
-
-      // Clear output processing if necessary, then output the image.
-      if (update_mode) SetConsoleMode(std_output, 0);
-      if (!WriteFile(std_output, image_mem, (DWORD)GlobalSize(image),
-                     &bytes_written, NULL) ||
-          bytes_written != GlobalSize(image)) {
-        hr = E_FAIL;
-      }
-      if (update_mode) SetConsoleMode(std_output, mode);
-      GlobalUnlock(image);
-    }
-  }
-
  if (frame != NULL) IUnknown_Release(frame);
  if (encoder != NULL) IUnknown_Release(encoder);
  if (factory != NULL) IUnknown_Release(factory);
@@ -172,21 +135,21 @@ static HRESULT WriteUsingWIC(const char* out_file_name, int use_stdout,
  return hr;
 }

-static int WritePNG(const char* out_file_name, int use_stdout,
+static int WritePNG(const char* out_file_name,
                    const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  uint8_t* const rgb = buffer->u.RGBA.rgba;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_BGRA);

-  return SUCCEEDED(WriteUsingWIC(out_file_name, use_stdout,
+  return SUCCEEDED(WriteUsingWIC(out_file_name,
                                 MAKE_REFGUID(GUID_ContainerFormatPng),
                                 rgb, stride, width, height, has_alpha));
 }

 #elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
-static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
+static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }
@@ -194,7 +157,7 @@ static void PNGAPI PNGErrorFunction(png_structp png, png_const_charp dummy) {
 static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  uint8_t* const rgb = buffer->u.RGBA.rgba;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const int has_alpha = (buffer->colorspace == MODE_RGBA);
  png_structp png;
@@ -202,7 +165,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
  png_uint_32 y;

  png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
-                                NULL, PNGErrorFunction, NULL);
+                                NULL, error_function, NULL);
  if (png == NULL) {
    return 0;
  }
@@ -243,7 +206,7 @@ static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
 static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const uint8_t* const rgb = buffer->u.RGBA.rgba;
+  const unsigned char* const rgb = buffer->u.RGBA.rgba;
  const int stride = buffer->u.RGBA.stride;
  const size_t bytes_per_px = alpha ? 4 : 3;
  uint32_t y;
@@ -262,150 +225,10 @@ static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer, int alpha) {
  return 1;
 }

-static void PutLE16(uint8_t* const dst, uint32_t value) {
-  dst[0] = (value >> 0) & 0xff;
-  dst[1] = (value >> 8) & 0xff;
-}
-
-static void PutLE32(uint8_t* const dst, uint32_t value) {
-  PutLE16(dst + 0, (value >>  0) & 0xffff);
-  PutLE16(dst + 2, (value >> 16) & 0xffff);
-}
-
-#define BMP_HEADER_SIZE 54
-static int WriteBMP(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = (buffer->colorspace != MODE_BGR);
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  const uint8_t* const rgba = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
-  const uint32_t bytes_per_px = has_alpha ? 4 : 3;
-  uint32_t y;
-  const uint32_t line_size = bytes_per_px * width;
-  const uint32_t bmp_stride = (line_size + 3) & ~3;   // pad to 4
-  const uint32_t total_size = bmp_stride * height + BMP_HEADER_SIZE;
-  uint8_t bmp_header[BMP_HEADER_SIZE] = { 0 };
-
-  // bitmap file header
-  PutLE16(bmp_header + 0, 0x4d42);                // signature 'BM'
-  PutLE32(bmp_header + 2, total_size);            // size including header
-  PutLE32(bmp_header + 6, 0);                     // reserved
-  PutLE32(bmp_header + 10, BMP_HEADER_SIZE);      // offset to pixel array
-  // bitmap info header
-  PutLE32(bmp_header + 14, 40);                   // DIB header size
-  PutLE32(bmp_header + 18, width);                // dimensions
-  PutLE32(bmp_header + 22, -(int)height);         // vertical flip!
-  PutLE16(bmp_header + 26, 1);                    // number of planes
-  PutLE16(bmp_header + 28, bytes_per_px * 8);     // bits per pixel
-  PutLE32(bmp_header + 30, 0);                    // no compression (BI_RGB)
-  PutLE32(bmp_header + 34, 0);                    // image size (dummy)
-  PutLE32(bmp_header + 38, 2400);                 // x pixels/meter
-  PutLE32(bmp_header + 42, 2400);                 // y pixels/meter
-  PutLE32(bmp_header + 46, 0);                    // number of palette colors
-  PutLE32(bmp_header + 50, 0);                    // important color count
-
-  // TODO(skal): color profile
-
-  // write header
-  if (fwrite(bmp_header, sizeof(bmp_header), 1, fout) != 1) {
-    return 0;
-  }
-
-  // write pixel array
-  for (y = 0; y < height; ++y) {
-    if (fwrite(rgba + y * stride, line_size, 1, fout) != 1) {
-      return 0;
-    }
-    // write padding zeroes
-    if (bmp_stride != line_size) {
-      const uint8_t zeroes[3] = { 0 };
-      if (fwrite(zeroes, bmp_stride - line_size, 1, fout) != 1) {
-        return 0;
-      }
-    }
-  }
-  return 1;
-}
-#undef BMP_HEADER_SIZE
-
-#define NUM_IFD_ENTRIES 15
-#define EXTRA_DATA_SIZE 16
-// 10b for signature/header + n * 12b entries + 4b for IFD terminator:
-#define EXTRA_DATA_OFFSET (10 + 12 * NUM_IFD_ENTRIES + 4)
-#define TIFF_HEADER_SIZE (EXTRA_DATA_OFFSET + EXTRA_DATA_SIZE)
-
-static int WriteTIFF(FILE* fout, const WebPDecBuffer* const buffer) {
-  const int has_alpha = (buffer->colorspace != MODE_RGB);
-  const uint32_t width = buffer->width;
-  const uint32_t height = buffer->height;
-  const uint8_t* const rgba = buffer->u.RGBA.rgba;
-  const int stride = buffer->u.RGBA.stride;
-  const uint8_t bytes_per_px = has_alpha ? 4 : 3;
-  // For non-alpha case, we omit tag 0x152 (ExtraSamples).
-  const uint8_t num_ifd_entries = has_alpha ? NUM_IFD_ENTRIES
-                                            : NUM_IFD_ENTRIES - 1;
-  uint8_t tiff_header[TIFF_HEADER_SIZE] = {
-    0x49, 0x49, 0x2a, 0x00,   // little endian signature
-    8, 0, 0, 0,               // offset to the unique IFD that follows
-    // IFD (offset = 8). Entries must be written in increasing tag order.
-    num_ifd_entries, 0,       // Number of entries in the IFD (12 bytes each).
-    0x00, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  10: Width  (TBD)
-    0x01, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    //  22: Height (TBD)
-    0x02, 0x01, 3, 0, bytes_per_px, 0, 0, 0,     //  34: BitsPerSample: 8888
-        EXTRA_DATA_OFFSET + 0, 0, 0, 0,
-    0x03, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  46: Compression: none
-    0x06, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    //  58: Photometric: RGB
-    0x11, 0x01, 4, 0, 1, 0, 0, 0,                //  70: Strips offset:
-        TIFF_HEADER_SIZE, 0, 0, 0,               //      data follows header
-    0x12, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    //  82: Orientation: topleft
-    0x15, 0x01, 3, 0, 1, 0, 0, 0,                //  94: SamplesPerPixels
-        bytes_per_px, 0, 0, 0,
-    0x16, 0x01, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 106: Rows per strip (TBD)
-    0x17, 0x01, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0,    // 118: StripByteCount (TBD)
-    0x1a, 0x01, 5, 0, 1, 0, 0, 0,                // 130: X-resolution
-        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
-    0x1b, 0x01, 5, 0, 1, 0, 0, 0,                // 142: Y-resolution
-        EXTRA_DATA_OFFSET + 8, 0, 0, 0,
-    0x1c, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 154: PlanarConfiguration
-    0x28, 0x01, 3, 0, 1, 0, 0, 0, 2, 0, 0, 0,    // 166: ResolutionUnit (inch)
-    0x52, 0x01, 3, 0, 1, 0, 0, 0, 1, 0, 0, 0,    // 178: ExtraSamples: rgbA
-    0, 0, 0, 0,                                  // 190: IFD terminator
-    // EXTRA_DATA_OFFSET:
-    8, 0, 8, 0, 8, 0, 8, 0,      // BitsPerSample
-    72, 0, 0, 0, 1, 0, 0, 0      // 72 pixels/inch, for X/Y-resolution
-  };
-  uint32_t y;
-
-  // Fill placeholders in IFD:
-  PutLE32(tiff_header + 10 + 8, width);
-  PutLE32(tiff_header + 22 + 8, height);
-  PutLE32(tiff_header + 106 + 8, height);
-  PutLE32(tiff_header + 118 + 8, width * bytes_per_px * height);
-  if (!has_alpha) PutLE32(tiff_header + 178, 0);  // IFD terminator
-
-  // write header
-  if (fwrite(tiff_header, sizeof(tiff_header), 1, fout) != 1) {
-    return 0;
-  }
-  // write pixel values
-  for (y = 0; y < height; ++y) {
-    if (fwrite(rgba + y * stride, bytes_per_px, width, fout) != width) {
-      return 0;
-    }
-  }
-
-  return 1;
-}
-
-#undef TIFF_HEADER_SIZE
-#undef EXTRA_DATA_OFFSET
-#undef EXTRA_DATA_SIZE
-#undef NUM_IFD_ENTRIES
-
 static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
  const uint32_t width = buffer->width;
  const uint32_t height = buffer->height;
-  const uint8_t* const a = buffer->u.YUVA.a;
+  const unsigned char* const a = buffer->u.YUVA.a;
  const int a_stride = buffer->u.YUVA.a_stride;
  uint32_t y;
  assert(a != NULL);
@@ -466,40 +289,30 @@ static int WritePGMOrYUV(FILE* fout, const WebPDecBuffer* const buffer,
  return ok;
 }

-static int SaveOutput(const WebPDecBuffer* const buffer,
-                      OutputFileFormat format, const char* const out_file) {
+static void SaveOutput(const WebPDecBuffer* const buffer,
+                       OutputFileFormat format, const char* const out_file) {
  FILE* fout = NULL;
  int needs_open_file = 1;
-  const int use_stdout = !strcmp(out_file, "-");
  int ok = 1;
  Stopwatch stop_watch;

-  if (verbose) {
-    StopwatchReset(&stop_watch);
-  }
+  if (verbose)
+    StopwatchReadAndReset(&stop_watch);

 #ifdef HAVE_WINCODEC_H
  needs_open_file = (format != PNG);
 #endif
-
-#if defined(_WIN32)
-  if (use_stdout && _setmode(_fileno(stdout), _O_BINARY) == -1) {
-    fprintf(stderr, "Failed to reopen stdout in O_BINARY mode.\n");
-    return -1;
-  }
-#endif
-
  if (needs_open_file) {
-    fout = use_stdout ? stdout : fopen(out_file, "wb");
-    if (fout == NULL) {
+    fout = fopen(out_file, "wb");
+    if (!fout) {
      fprintf(stderr, "Error opening output file %s\n", out_file);
-      return 0;
+      return;
    }
  }

  if (format == PNG) {
 #ifdef HAVE_WINCODEC_H
-    ok &= WritePNG(out_file, use_stdout, buffer);
+    ok &= WritePNG(out_file, buffer);
 #else
    ok &= WritePNG(fout, buffer);
 #endif
@@ -507,36 +320,23 @@ static int SaveOutput(const WebPDecBuffer* const buffer,
    ok &= WritePPM(fout, buffer, 1);
  } else if (format == PPM) {
    ok &= WritePPM(fout, buffer, 0);
-  } else if (format == BMP) {
-    ok &= WriteBMP(fout, buffer);
-  } else if (format == TIFF) {
-    ok &= WriteTIFF(fout, buffer);
  } else if (format == PGM || format == YUV) {
    ok &= WritePGMOrYUV(fout, buffer, format);
  } else if (format == ALPHA_PLANE_ONLY) {
    ok &= WriteAlphaPlane(fout, buffer);
  }
-  if (fout != NULL && fout != stdout) {
+  if (fout) {
    fclose(fout);
  }
  if (ok) {
-    if (use_stdout) {
-      fprintf(stderr, "Saved to stdout\n");
-    } else {
-      fprintf(stderr, "Saved file %s\n", out_file);
-    }
+    printf("Saved file %s\n", out_file);
    if (verbose) {
      const double write_time = StopwatchReadAndReset(&stop_watch);
-      fprintf(stderr, "Time to write output: %.3fs\n", write_time);
+      printf("Time to write output: %.3fs\n", write_time);
    }
  } else {
-    if (use_stdout) {
-      fprintf(stderr, "Error writing to stdout !!\n");
-    } else {
-      fprintf(stderr, "Error writing file %s !!\n", out_file);
-    }
+    fprintf(stderr, "Error writing file %s !!\n", out_file);
  }
-  return ok;
 }

 static void Help(void) {
@@ -545,23 +345,18 @@ static void Help(void) {
         "Use following options to convert into alternate image formats:\n"
         "  -pam ......... save the raw RGBA samples as a color PAM\n"
         "  -ppm ......... save the raw RGB samples as a color PPM\n"
-         "  -bmp ......... save as uncompressed BMP format\n"
-         "  -tiff ........ save as uncompressed TIFF format\n"
         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
-         "                 file with IMC4 layout\n"
-         "  -yuv ......... save the raw YUV samples in flat layout\n"
+         "                 file with IMC4 layout.\n"
+         "  -yuv ......... save the raw YUV samples in flat layout.\n"
         "\n"
         " Other options are:\n"
         "  -version  .... print version number and exit.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
-         "  -nodither .... disable dithering.\n"
-         "  -dither <d> .. dithering strength (in 0..100)\n"
         "  -mt .......... use multi-threading\n"
         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
         "  -alpha ....... only save the alpha plane.\n"
-         "  -incremental . use incremental decoding (useful for tests)\n"
         "  -h     ....... this help message.\n"
         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
 #ifndef WEBP_DLL
@@ -575,12 +370,7 @@ static const char* const kStatusMessages[] = {
  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
 };

-static const char* const kFormatType[] = {
-  "unspecified", "lossy", "lossless"
-};
-
 int main(int argc, const char *argv[]) {
-  int ok = 0;
  const char *in_file = NULL;
  const char *out_file = NULL;

@@ -588,7 +378,6 @@ int main(int argc, const char *argv[]) {
  WebPDecBuffer* const output_buffer = &config.output;
  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
-  int incremental = 0;
  int c;

  if (!WebPInitDecoderConfig(&config)) {
@@ -612,10 +401,6 @@ int main(int argc, const char *argv[]) {
      format = PAM;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
-    } else if (!strcmp(argv[c], "-bmp")) {
-      format = BMP;
-    } else if (!strcmp(argv[c], "-tiff")) {
-      format = TIFF;
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetDecoderVersion();
      printf("%d.%d.%d\n",
@@ -627,10 +412,6 @@ int main(int argc, const char *argv[]) {
      format = YUV;
    } else if (!strcmp(argv[c], "-mt")) {
      config.options.use_threads = 1;
-    } else if (!strcmp(argv[c], "-nodither")) {
-      config.options.dithering_strength = 0;
-    } else if (!strcmp(argv[c], "-dither") && c < argc - 1) {
-      config.options.dithering_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      config.options.use_cropping = 1;
      config.options.crop_left   = strtol(argv[++c], NULL, 0);
@@ -647,11 +428,6 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-noasm")) {
      VP8GetCPUInfo = NULL;
 #endif
-    } else if (!strcmp(argv[c], "-incremental")) {
-      incremental = 1;
-    } else if (!strcmp(argv[c], "--")) {
-      if (c < argc - 1) in_file = argv[++c];
-      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Unknown option '%s'\n", argv[c]);
      Help();
@@ -670,14 +446,14 @@ int main(int argc, const char *argv[]) {
  {
    Stopwatch stop_watch;
    VP8StatusCode status = VP8_STATUS_OK;
+    int ok;
    size_t data_size = 0;
    const uint8_t* data = NULL;

    if (!ExUtilReadFile(in_file, &data, &data_size)) return -1;

-    if (verbose) {
-      StopwatchReset(&stop_watch);
-    }
+    if (verbose)
+      StopwatchReadAndReset(&stop_watch);

    status = WebPGetFeatures(data, data_size, bitstream);
    if (status != VP8_STATUS_OK) {
@@ -705,13 +481,6 @@ int main(int argc, const char *argv[]) {
      case PPM:
        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
        break;
-      case BMP:
-        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
-        break;
-      case TIFF:    // note: force pre-multiplied alpha
-        output_buffer->colorspace =
-            bitstream->has_alpha ? MODE_rgbA : MODE_RGB;
-        break;
      case PGM:
      case YUV:
        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
@@ -723,25 +492,11 @@ int main(int argc, const char *argv[]) {
        free((void*)data);
        return -1;
    }
-
-    // Decoding call.
-    if (!incremental) {
-      status = WebPDecode(data, data_size, &config);
-    } else {
-      WebPIDecoder* const idec = WebPIDecode(data, data_size, &config);
-      if (idec == NULL) {
-        fprintf(stderr, "Failed during WebPINewDecoder().\n");
-        status = VP8_STATUS_OUT_OF_MEMORY;
-        goto end;
-      } else {
-        status = WebPIUpdate(idec, data, data_size);
-        WebPIDelete(idec);
-      }
-    }
+    status = WebPDecode(data, data_size, &config);

    if (verbose) {
      const double decode_time = StopwatchReadAndReset(&stop_watch);
-      fprintf(stderr, "Time to decode picture: %.3fs\n", decode_time);
+      printf("Time to decode picture: %.3fs\n", decode_time);
    }
 end:
    free((void*)data);
@@ -749,29 +504,24 @@ int main(int argc, const char *argv[]) {
    if (!ok) {
      fprintf(stderr, "Decoding of %s failed.\n", in_file);
      fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
-      goto Exit;
+      return -1;
    }
  }

-  if (out_file != NULL) {
-    fprintf(stderr, "Decoded %s. Dimensions: %d x %d %s. Format: %s. "
-                    "Now saving...\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
-    ok = SaveOutput(output_buffer, format, out_file);
+  if (out_file) {
+    printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
+           output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
+    SaveOutput(output_buffer, format, out_file);
  } else {
-    fprintf(stderr, "File %s can be decoded "
-                    "(dimensions: %d x %d %s. Format: %s).\n",
-            in_file, output_buffer->width, output_buffer->height,
-            bitstream->has_alpha ? " (with alpha)" : "",
-            kFormatType[bitstream->format]);
-    fprintf(stderr, "Nothing written; "
-                    "use -o flag to save the result as e.g. PNG.\n");
+    printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
+           in_file, output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
+    printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
  }
- Exit:
  WebPFreeDecBuffer(output_buffer);
-  return ok ? 0 : -1;
+
+  return 0;
 }

 //------------------------------------------------------------------------------
--- a/examples/example_util.c
+++ b/examples/example_util.c
@@ -14,6 +14,10 @@
 #include <stdio.h>
 #include <stdlib.h>

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // -----------------------------------------------------------------------------
 // File I/O

@@ -70,3 +74,6 @@ int ExUtilWriteFile(const char* const file_name,
  return ok;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/examples/example_util.h
+++ b/examples/example_util.h
@@ -15,7 +15,7 @@

 #include "webp/types.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -29,7 +29,7 @@ int ExUtilReadFile(const char* const file_name,
 int ExUtilWriteFile(const char* const file_name,
                    const uint8_t* data, size_t data_size);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/examples/gif2webp.c
+++ b/examples/gif2webp.c
@@ -14,73 +14,34 @@

 #include <assert.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>

 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif

-#ifdef WEBP_HAVE_GIF
-
 #include <gif_lib.h>
 #include "webp/encode.h"
 #include "webp/mux.h"
 #include "./example_util.h"
-#include "./gif2webp_util.h"

 #define GIF_TRANSPARENT_MASK 0x01
 #define GIF_DISPOSE_MASK     0x07
 #define GIF_DISPOSE_SHIFT    2
+#define TRANSPARENT_COLOR    0x00ffffff
 #define WHITE_COLOR          0xffffffff
-#define MAX_CACHE_SIZE       30

 //------------------------------------------------------------------------------

-static int transparent_index = -1;  // Index of transparent color in the map.
+static int transparent_index = -1;  // No transparency by default.

-static void SanitizeKeyFrameIntervals(size_t* const kmin_ptr,
-                                      size_t* const kmax_ptr) {
-  size_t kmin = *kmin_ptr;
-  size_t kmax = *kmax_ptr;
-  int print_warning = 1;
-
-  if (kmin == 0) {  // Disable keyframe insertion.
-    kmax = ~0;
-    kmin = kmax - 1;
-    print_warning = 0;
+static void ClearPicture(WebPPicture* const picture, uint32_t color) {
+  int x, y;
+  for (y = 0; y < picture->height; ++y) {
+    uint32_t* const dst = picture->argb + y * picture->argb_stride;
+    for (x = 0; x < picture->width; ++x) dst[x] = color;
  }
-  if (kmax == 0) {
-    kmax = ~0;
-    print_warning = 0;
-  }
-
-  if (kmin >= kmax) {
-    kmin = kmax - 1;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin < kmax.\n", (int)kmin);
-    }
-  } else if (kmin < (kmax / 2 + 1)) {
-    // This ensures that cache.keyframe + kmin >= kmax is always true. So, we
-    // can flush all the frames in the ‘count_since_key_frame == kmax’ case.
-    kmin = (kmax / 2 + 1);
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmin >= kmax / 2 + 1.\n",
-              (int)kmin);
-    }
-  }
-  // Limit the max number of frames that are allocated.
-  if (kmax - kmin > MAX_CACHE_SIZE) {
-    kmin = kmax - MAX_CACHE_SIZE;
-    if (print_warning) {
-      fprintf(stderr,
-              "WARNING: Setting kmin = %d, so that kmax - kmin <= 30.\n",
-              (int)kmin);
-    }
-  }
-  *kmin_ptr = kmin;
-  *kmax_ptr = kmax;
 }

 static void Remap(const uint8_t* const src, const GifFileType* const gif,
@@ -94,34 +55,30 @@ static void Remap(const uint8_t* const src, const GifFileType* const gif,

  for (i = 0; i < len; ++i) {
    const GifColorType c = colors[src[i]];
-    dst[i] = (src[i] == transparent_index) ? WEBP_UTIL_TRANSPARENT_COLOR
+    dst[i] = (src[i] == transparent_index) ? TRANSPARENT_COLOR
           : c.Blue | (c.Green << 8) | (c.Red << 16) | (0xff << 24);
  }
 }

-// Read the GIF image frame.
-static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
-                     WebPPicture* const webp_frame) {
-  WebPPicture sub_image;
+static int ReadSubImage(GifFileType* gif, WebPPicture* pic, WebPPicture* view) {
  const GifImageDesc image_desc = gif->Image;
+  const int offset_x = image_desc.Left;
+  const int offset_y = image_desc.Top;
+  const int sub_w = image_desc.Width;
+  const int sub_h = image_desc.Height;
  uint32_t* dst = NULL;
  uint8_t* tmp = NULL;
  int ok = 0;
-  WebPFrameRect rect = {
-      image_desc.Left, image_desc.Top, image_desc.Width, image_desc.Height
-  };
-  *gif_rect = rect;

  // Use a view for the sub-picture:
-  if (!WebPPictureView(webp_frame, rect.x_offset, rect.y_offset,
-                       rect.width, rect.height, &sub_image)) {
+  if (!WebPPictureView(pic, offset_x, offset_y, sub_w, sub_h, view)) {
    fprintf(stderr, "Sub-image %dx%d at position %d,%d is invalid!\n",
-            rect.width, rect.height, rect.x_offset, rect.y_offset);
-    return 0;
+            sub_w, sub_h, offset_x, offset_y);
+    goto End;
  }
-  dst = sub_image.argb;
+  dst = view->argb;

-  tmp = (uint8_t*)malloc(rect.width * sizeof(*tmp));
+  tmp = (uint8_t*)malloc(sub_w * sizeof(*tmp));
  if (tmp == NULL) goto End;

  if (image_desc.Interlace) {  // Interlaced image.
@@ -131,32 +88,32 @@ static int ReadFrame(GifFileType* const gif, WebPFrameRect* const gif_rect,
    int pass;
    for (pass = 0; pass < 4; ++pass) {
      int y;
-      for (y = interlace_offsets[pass]; y < rect.height;
-           y += interlace_jumps[pass]) {
-        if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-        Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
+      for (y = interlace_offsets[pass]; y < sub_h; y += interlace_jumps[pass]) {
+        if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+        Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
      }
    }
  } else {  // Non-interlaced image.
    int y;
-    for (y = 0; y < rect.height; ++y) {
-      if (DGifGetLine(gif, tmp, rect.width) == GIF_ERROR) goto End;
-      Remap(tmp, gif, dst + y * sub_image.argb_stride, rect.width);
+    for (y = 0; y < sub_h; ++y) {
+      if (DGifGetLine(gif, tmp, sub_w) == GIF_ERROR) goto End;
+      Remap(tmp, gif, dst + y * view->argb_stride, sub_w);
    }
  }
+  // re-align the view with even offset (and adjust dimensions if needed).
+  WebPPictureView(pic, offset_x & ~1, offset_y & ~1,
+                  sub_w + (offset_x & 1), sub_h + (offset_y & 1), view);
  ok = 1;

 End:
-  if (!ok) webp_frame->error_code = sub_image.error_code;
-  WebPPictureFree(&sub_image);
  free(tmp);
  return ok;
 }

 static int GetBackgroundColor(const ColorMapObject* const color_map,
-                              int bgcolor_idx, uint32_t* const bgcolor) {
+                              GifWord bgcolor_idx, uint32_t* const bgcolor) {
  if (transparent_index != -1 && bgcolor_idx == transparent_index) {
-    *bgcolor = WEBP_UTIL_TRANSPARENT_COLOR;  // Special case.
+    *bgcolor = TRANSPARENT_COLOR;  // Special case.
    return 1;
  } else if (color_map == NULL || color_map->Colors == NULL
             || bgcolor_idx >= color_map->ColorCount) {
@@ -177,20 +134,20 @@ static void DisplayGifError(const GifFileType* const gif, int gif_error) {
 #if defined(GIFLIB_MAJOR) && defined(GIFLIB_MINOR) && \
        ((GIFLIB_MAJOR == 4 && GIFLIB_MINOR >= 2) || GIFLIB_MAJOR > 4)
 #if GIFLIB_MAJOR >= 5
-  // Static string actually, hence the const char* cast.
-  const char* error_str = (const char*)GifErrorString(
-      (gif == NULL) ? gif_error : gif->Error);
+    // Static string actually, hence the const char* cast.
+    const char* error_str = (const char*)GifErrorString(
+        (gif == NULL) ? gif_error : gif->Error);
 #else
-  const char* error_str = (const char*)GifErrorString();
-  (void)gif;
+    const char* error_str = (const char*)GifErrorString();
+    (void)gif;
 #endif
-  if (error_str == NULL) error_str = "Unknown error";
-  fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
+    if (error_str == NULL) error_str = "Unknown error";
+    fprintf(stderr, "GIFLib Error %d: %s\n", gif_error, error_str);
 #else
-  (void)gif;
-  fprintf(stderr, "GIFLib Error %d: ", gif_error);
-  PrintGifError();
-  fprintf(stderr, "\n");
+    (void)gif;
+    fprintf(stderr, "GIFLib Error %d: ", gif_error);
+    PrintGifError();
+    fprintf(stderr, "\n");
 #endif
 }

@@ -204,12 +161,6 @@ static const char* ErrorString(WebPMuxError err) {
  return kErrorMessages[-err];
 }

-enum {
-  METADATA_ICC  = (1 << 0),
-  METADATA_XMP  = (1 << 1),
-  METADATA_ALL  = METADATA_ICC | METADATA_XMP
-};
-
 //------------------------------------------------------------------------------

 static void Help(void) {
@@ -218,19 +169,9 @@ static void Help(void) {
  printf("options:\n");
  printf("  -h / -help  ............ this help\n");
  printf("  -lossy ................. Encode image using lossy compression.\n");
-  printf("  -mixed ................. For each frame in the image, pick lossy\n"
-         "                           or lossless compression heuristically.\n");
  printf("  -q <float> ............. quality factor (0:small..100:big)\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
-  printf("  -kmin <int> ............ Min distance between key frames\n");
-  printf("  -kmax <int> ............ Max distance between key frames\n");
  printf("  -f <int> ............... filter strength (0=off..100)\n");
-  printf("  -metadata <string> ..... comma separated list of metadata to\n");
-  printf("                           ");
-  printf("copy from the input to the output if present.\n");
-  printf("                           "
-         "Valid values: all, none, icc, xmp (default)\n");
-  printf("  -mt .................... use multi-threading if available\n");
  printf("\n");
  printf("  -version ............... print version number and exit.\n");
  printf("  -v ..................... verbose.\n");
@@ -248,39 +189,29 @@ int main(int argc, const char *argv[]) {
  const char *in_file = NULL, *out_file = NULL;
  FILE* out = NULL;
  GifFileType* gif = NULL;
-  WebPConfig config;
-  WebPPicture frame;
-  WebPMuxFrameInfo info;
+  WebPPicture picture;
+  WebPMuxFrameInfo frame;
  WebPMuxAnimParams anim = { WHITE_COLOR, 0 };
-  WebPFrameCache* cache = NULL;

-  int is_first_frame = 1;     // Whether we are processing the first frame.
+  int is_first_frame = 1;
  int done;
  int c;
  int quiet = 0;
+  WebPConfig config;
  WebPMux* mux = NULL;
  WebPData webp_data = { NULL, 0 };
-  int keep_metadata = METADATA_XMP;  // ICC not output by default.
  int stored_icc = 0;  // Whether we have already stored an ICC profile.
  int stored_xmp = 0;

-  int default_kmin = 1;  // Whether to use default kmin value.
-  int default_kmax = 1;
-  size_t kmin = 0;
-  size_t kmax = 0;
-  int allow_mixed = 0;   // If true, each frame can be lossy or lossless.
+  memset(&frame, 0, sizeof(frame));
+  frame.id = WEBP_CHUNK_ANMF;
+  frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;

-  memset(&info, 0, sizeof(info));
-  info.id = WEBP_CHUNK_ANMF;
-  info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
-  info.blend_method = WEBP_MUX_BLEND;
-
-  if (!WebPConfigInit(&config) || !WebPPictureInit(&frame)) {
+  if (!WebPConfigInit(&config) || !WebPPictureInit(&picture)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    return -1;
  }
  config.lossless = 1;  // Use lossless compression by default.
-  config.image_hint = WEBP_HINT_GRAPH;   // always low-color

  if (argc == 1) {
    Help();
@@ -295,62 +226,12 @@ int main(int argc, const char *argv[]) {
      out_file = argv[++c];
    } else if (!strcmp(argv[c], "-lossy")) {
      config.lossless = 0;
-    } else if (!strcmp(argv[c], "-mixed")) {
-      allow_mixed = 1;
-      config.lossless = 0;
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
      config.quality = (float)strtod(argv[++c], NULL);
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = strtol(argv[++c], NULL, 0);
-    } else if (!strcmp(argv[c], "-kmax") && c < argc - 1) {
-      kmax = strtoul(argv[++c], NULL, 0);
-      default_kmax = 0;
-    } else if (!strcmp(argv[c], "-kmin") && c < argc - 1) {
-      kmin = strtoul(argv[++c], NULL, 0);
-      default_kmin = 0;
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
      config.filter_strength = strtol(argv[++c], NULL, 0);
-    } else if (!strcmp(argv[c], "-metadata") && c < argc - 1) {
-      static const struct {
-        const char* option;
-        int flag;
-      } kTokens[] = {
-        { "all",  METADATA_ALL },
-        { "none", 0 },
-        { "icc",  METADATA_ICC },
-        { "xmp",  METADATA_XMP },
-      };
-      const size_t kNumTokens = sizeof(kTokens) / sizeof(*kTokens);
-      const char* start = argv[++c];
-      const char* const end = start + strlen(start);
-
-      keep_metadata = 0;
-      while (start < end) {
-        size_t i;
-        const char* token = strchr(start, ',');
-        if (token == NULL) token = end;
-
-        for (i = 0; i < kNumTokens; ++i) {
-          if ((size_t)(token - start) == strlen(kTokens[i].option) &&
-              !strncmp(start, kTokens[i].option, strlen(kTokens[i].option))) {
-            if (kTokens[i].flag != 0) {
-              keep_metadata |= kTokens[i].flag;
-            } else {
-              keep_metadata = 0;
-            }
-            break;
-          }
-        }
-        if (i == kNumTokens) {
-          fprintf(stderr, "Error! Unknown metadata type '%.*s'\n",
-                  (int)(token - start), start);
-          Help();
-          return -1;
-        }
-        start = token + 1;
-      }
-    } else if (!strcmp(argv[c], "-mt")) {
-      ++config.thread_level;
    } else if (!strcmp(argv[c], "-version")) {
      const int enc_version = WebPGetEncoderVersion();
      const int mux_version = WebPGetMuxVersion();
@@ -363,9 +244,6 @@ int main(int argc, const char *argv[]) {
      quiet = 1;
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
-    } else if (!strcmp(argv[c], "--")) {
-      if (c < argc - 1) in_file = argv[++c];
-      break;
    } else if (argv[c][0] == '-') {
      fprintf(stderr, "Error! Unknown option '%s'\n", argv[c]);
      Help();
@@ -374,16 +252,6 @@ int main(int argc, const char *argv[]) {
      in_file = argv[c];
    }
  }
-
-  // Appropriate default kmin, kmax values for lossy and lossless.
-  if (default_kmin) {
-    kmin = config.lossless ? 9 : 3;
-  }
-  if (default_kmax) {
-    kmax = config.lossless ? 17 : 5;
-  }
-  SanitizeKeyFrameIntervals(&kmin, &kmax);
-
  if (!WebPValidateConfig(&config)) {
    fprintf(stderr, "Error! Invalid configuration.\n");
    goto End;
@@ -404,15 +272,11 @@ int main(int argc, const char *argv[]) {
 #endif
  if (gif == NULL) goto End;

-  // Allocate current buffer
-  frame.width = gif->SWidth;
-  frame.height = gif->SHeight;
-  frame.use_argb = 1;
-  if (!WebPPictureAlloc(&frame)) goto End;
-
-  // Initialize cache
-  cache = WebPFrameCacheNew(frame.width, frame.height, kmin, kmax, allow_mixed);
-  if (cache == NULL) goto End;
+  // Allocate picture buffer
+  picture.width = gif->SWidth;
+  picture.height = gif->SHeight;
+  picture.use_argb = 1;
+  if (!WebPPictureAlloc(&picture)) goto End;

  mux = WebPMuxNew();
  if (mux == NULL) {
@@ -428,25 +292,59 @@ int main(int argc, const char *argv[]) {

    switch (type) {
      case IMAGE_DESC_RECORD_TYPE: {
-        WebPFrameRect gif_rect;
+        WebPPicture sub_image;
+        WebPMemoryWriter memory;
+
+        if (frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
+          ClearPicture(&picture, anim.bgcolor);
+        }

        if (!DGifGetImageDesc(gif)) goto End;
-        if (!ReadFrame(gif, &gif_rect, &frame)) {
+        if (!ReadSubImage(gif, &picture, &sub_image)) goto End;
+
+        if (!config.lossless) {
+          // We need to call the BGRA variant because of the way we do Remap().
+          // Note: 'sub_image' then stops being a view and owns some memory.
+          WebPPictureImportBGRA(
+              &sub_image, (uint8_t*)sub_image.argb,
+              sub_image.argb_stride * sizeof(*sub_image.argb));
+          sub_image.use_argb = 0;
+        } else {
+          sub_image.use_argb = 1;
+        }
+
+        sub_image.writer = WebPMemoryWrite;
+        sub_image.custom_ptr = &memory;
+        WebPMemoryWriterInit(&memory);
+        if (!WebPEncode(&config, &sub_image)) {
+          fprintf(stderr, "Error! Cannot encode picture as WebP\n");
+          fprintf(stderr, "Error code: %d\n", sub_image.error_code);
          goto End;
        }

-        if (!WebPFrameCacheAddFrame(cache, &config, &gif_rect, &frame, &info)) {
-          fprintf(stderr, "Error! Cannot encode frame as WebP\n");
-          fprintf(stderr, "Error code: %d\n", frame.error_code);
-        }
-
-        err = WebPFrameCacheFlush(cache, verbose, mux);
+        // Now we have all the info about the frame, as a Graphic Control
+        // Extension Block always appears before the Image Descriptor Block.
+        // So add the frame to mux.
+        frame.x_offset = gif->Image.Left & ~1;
+        frame.y_offset = gif->Image.Top & ~1;
+        frame.bitstream.bytes = memory.mem;
+        frame.bitstream.size = memory.size;
+        err = WebPMuxPushFrame(mux, &frame, 1);
        if (err != WEBP_MUX_OK) {
          fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
                  ErrorString(err));
          goto End;
        }
-        is_first_frame = 0;
+        if (verbose) {
+          printf("Added frame %dx%d (offset:%d,%d duration:%d) ",
+                 sub_image.width, sub_image.height,
+                 frame.x_offset, frame.y_offset,
+                 frame.duration);
+          printf("dispose:%d transparent index:%d\n",
+                 frame.dispose_method, transparent_index);
+        }
+        WebPDataClear(&frame.bitstream);
+        WebPPictureFree(&sub_image);
        break;
      }
      case EXTENSION_RECORD_TYPE: {
@@ -464,18 +362,14 @@ int main(int argc, const char *argv[]) {
            const int dispose = (flags >> GIF_DISPOSE_SHIFT) & GIF_DISPOSE_MASK;
            const int delay = data[2] | (data[3] << 8);  // In 10 ms units.
            if (data[0] != 4) goto End;
-            info.duration = delay * 10;  // Duration is in 1 ms units for WebP.
+            frame.duration = delay * 10;  // Duration is in 1 ms units for WebP.
            if (dispose == 3) {
-              static int warning_printed = 0;
-              if (!warning_printed) {
-                fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE unsupported.\n");
-                warning_printed = 1;
-              }
+              fprintf(stderr, "WARNING: GIF_DISPOSE_RESTORE not supported.\n");
              // failsafe. TODO(urvang): emulate the correct behaviour by
              // recoding the whole frame.
-              info.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+              frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
            } else {
-              info.dispose_method =
+              frame.dispose_method =
                  (dispose == 2) ? WEBP_MUX_DISPOSE_BACKGROUND
                                 : WEBP_MUX_DISPOSE_NONE;
            }
@@ -484,9 +378,10 @@ int main(int argc, const char *argv[]) {
              if (!GetBackgroundColor(gif->SColorMap, gif->SBackGroundColor,
                                      &anim.bgcolor)) {
                fprintf(stderr, "GIF decode warning: invalid background color "
-                                "index. Assuming white background.\n");
+                        "index. Assuming white background.\n");
              }
-              WebPUtilClearPic(&frame, NULL);
+              ClearPicture(&picture, anim.bgcolor);
+              is_first_frame = 0;
            }
            break;
          }
@@ -503,14 +398,11 @@ int main(int argc, const char *argv[]) {
              anim.loop_count = data[2] | (data[3] << 8);
              if (verbose) printf("Loop count: %d\n", anim.loop_count);
            } else {  // An extension containing metadata.
-              // We only store the first encountered chunk of each type, and
-              // only if requested by the user.
-              const int is_xmp = (keep_metadata & METADATA_XMP) &&
-                                 !stored_xmp &&
-                                 !memcmp(data + 1, "XMP DataXMP", 11);
-              const int is_icc = (keep_metadata & METADATA_ICC) &&
-                                 !stored_icc &&
-                                 !memcmp(data + 1, "ICCRGBG1012", 11);
+              // We only store the first encountered chunk of each type.
+              const int is_xmp =
+                  !stored_xmp && !memcmp(data + 1, "XMP DataXMP", 11);
+              const int is_icc =
+                  !stored_icc && !memcmp(data + 1, "ICCRGBG1012", 11);
              if (is_xmp || is_icc) {
                const char* const fourccs[2] = { "XMP " , "ICCP" };
                const char* const features[2] = { "XMP" , "ICC" };
@@ -594,14 +486,6 @@ int main(int argc, const char *argv[]) {
    }
  } while (!done);

-  // Flush any pending frames.
-  err = WebPFrameCacheFlushAll(cache, verbose, mux);
-  if (err != WEBP_MUX_OK) {
-    fprintf(stderr, "ERROR (%s): Could not add animation frame.\n",
-            ErrorString(err));
-    goto End;
-  }
-
  // Finish muxing
  err = WebPMuxSetAnimationParams(mux, &anim);
  if (err != WEBP_MUX_OK) {
@@ -636,8 +520,7 @@ int main(int argc, const char *argv[]) {
 End:
  WebPDataClear(&webp_data);
  WebPMuxDelete(mux);
-  WebPPictureFree(&frame);
-  WebPFrameCacheDelete(cache);
+  WebPPictureFree(&picture);
  if (out != NULL && out_file != NULL) fclose(out);

  if (gif_error != GIF_OK) {
@@ -650,14 +533,4 @@ int main(int argc, const char *argv[]) {
  return !ok;
 }

-#else  // !WEBP_HAVE_GIF
-
-int main(int argc, const char *argv[]) {
-  fprintf(stderr, "GIF support not enabled in %s.\n", argv[0]);
-  (void)argc;
-  return 0;
-}
-
-#endif
-
 //------------------------------------------------------------------------------
--- a/examples/gif2webp_util.c
+++ b/examples/gif2webp_util.c
@@ -1,667 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Helper structs and methods for gif2webp tool.
-//
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "webp/encode.h"
-#include "./gif2webp_util.h"
-
-#define DELTA_INFINITY      1ULL << 32
-#define KEYFRAME_NONE       -1
-
-//------------------------------------------------------------------------------
-// Helper utilities.
-
-static void ClearRectangle(WebPPicture* const picture,
-                           int left, int top, int width, int height) {
-  int j;
-  for (j = top; j < top + height; ++j) {
-    uint32_t* const dst = picture->argb + j * picture->argb_stride;
-    int i;
-    for (i = left; i < left + width; ++i) {
-      dst[i] = WEBP_UTIL_TRANSPARENT_COLOR;
-    }
-  }
-}
-
-void WebPUtilClearPic(WebPPicture* const picture,
-                      const WebPFrameRect* const rect) {
-  if (rect != NULL) {
-    ClearRectangle(picture, rect->x_offset, rect->y_offset,
-                   rect->width, rect->height);
-  } else {
-    ClearRectangle(picture, 0, 0, picture->width, picture->height);
-  }
-}
-
-// TODO: Also used in picture.c. Move to a common location?
-// Copy width x height pixels from 'src' to 'dst' honoring the strides.
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-// Copy pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are assumed
-// to be already allocated.
-static void CopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
-  assert(src->width == dst->width && src->height == dst->height);
-  CopyPlane((uint8_t*)src->argb, 4 * src->argb_stride, (uint8_t*)dst->argb,
-            4 * dst->argb_stride, 4 * src->width, src->height);
-}
-
-// Given 'src' picture and its frame rectangle 'rect', blend it into 'dst'.
-static void BlendPixels(const WebPPicture* const src,
-                        const WebPFrameRect* const rect,
-                        WebPPicture* const dst) {
-  int j;
-  assert(src->width == dst->width && src->height == dst->height);
-  for (j = rect->y_offset; j < rect->y_offset + rect->height; ++j) {
-    int i;
-    for (i = rect->x_offset; i < rect->x_offset + rect->width; ++i) {
-      const uint32_t src_pixel = src->argb[j * src->argb_stride + i];
-      const int src_alpha = src_pixel >> 24;
-      if (src_alpha != 0) {
-        dst->argb[j * dst->argb_stride + i] = src_pixel;
-      }
-    }
-  }
-}
-
-// Replace transparent pixels within 'dst_rect' of 'dst' by those in the 'src'.
-static void ReduceTransparency(const WebPPicture* const src,
-                               const WebPFrameRect* const rect,
-                               WebPPicture* const dst) {
-  int i, j;
-  assert(src != NULL && dst != NULL && rect != NULL);
-  assert(src->width == dst->width && src->height == dst->height);
-  for (j = rect->y_offset; j < rect->y_offset + rect->height; ++j) {
-    for (i = rect->x_offset; i < rect->x_offset + rect->width; ++i) {
-      const uint32_t src_pixel = src->argb[j * src->argb_stride + i];
-      const int src_alpha = src_pixel >> 24;
-      const uint32_t dst_pixel = dst->argb[j * dst->argb_stride + i];
-      const int dst_alpha = dst_pixel >> 24;
-      if (dst_alpha == 0 && src_alpha == 0xff) {
-        dst->argb[j * dst->argb_stride + i] = src_pixel;
-      }
-    }
-  }
-}
-
-// Replace similar blocks of pixels by a 'see-through' transparent block
-// with uniform average color.
-static void FlattenSimilarBlocks(const WebPPicture* const src,
-                                 const WebPFrameRect* const rect,
-                                 WebPPicture* const dst) {
-  int i, j;
-  const int block_size = 8;
-  const int y_start = (rect->y_offset + block_size) & ~(block_size - 1);
-  const int y_end = (rect->y_offset + rect->height) & ~(block_size - 1);
-  const int x_start = (rect->x_offset + block_size) & ~(block_size - 1);
-  const int x_end = (rect->x_offset + rect->width) & ~(block_size - 1);
-  assert(src != NULL && dst != NULL && rect != NULL);
-  assert(src->width == dst->width && src->height == dst->height);
-  assert((block_size & (block_size - 1)) == 0);  // must be a power of 2
-  // Iterate over each block and count similar pixels.
-  for (j = y_start; j < y_end; j += block_size) {
-    for (i = x_start; i < x_end; i += block_size) {
-      int cnt = 0;
-      int avg_r = 0, avg_g = 0, avg_b = 0;
-      int x, y;
-      const uint32_t* const psrc = src->argb + j * src->argb_stride + i;
-      uint32_t* const pdst = dst->argb + j * dst->argb_stride + i;
-      for (y = 0; y < block_size; ++y) {
-        for (x = 0; x < block_size; ++x) {
-          const uint32_t src_pixel = psrc[x + y * src->argb_stride];
-          const int alpha = src_pixel >> 24;
-          if (alpha == 0xff &&
-              src_pixel == pdst[x + y * dst->argb_stride]) {
-              ++cnt;
-              avg_r += (src_pixel >> 16) & 0xff;
-              avg_g += (src_pixel >>  8) & 0xff;
-              avg_b += (src_pixel >>  0) & 0xff;
-          }
-        }
-      }
-      // If we have a fully similar block, we replace it with an
-      // average transparent block. This compresses better in lossy mode.
-      if (cnt == block_size * block_size) {
-        const uint32_t color = (0x00          << 24) |
-                               ((avg_r / cnt) << 16) |
-                               ((avg_g / cnt) <<  8) |
-                               ((avg_b / cnt) <<  0);
-        for (y = 0; y < block_size; ++y) {
-          for (x = 0; x < block_size; ++x) {
-            pdst[x + y * dst->argb_stride] = color;
-          }
-        }
-      }
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Key frame related utilities.
-
-// Returns true if 'curr' frame with frame rectangle 'curr_rect' is a key frame,
-// that is, it can be decoded independently of 'prev' canvas.
-static int IsKeyFrame(const WebPPicture* const curr,
-                      const WebPFrameRect* const curr_rect,
-                      const WebPPicture* const prev) {
-  int i, j;
-  int is_key_frame = 1;
-
-  // If previous canvas (with previous frame disposed) is all transparent,
-  // current frame is a key frame.
-  for (i = 0; i < prev->width; ++i) {
-    for (j = 0; j < prev->height; ++j) {
-      const uint32_t prev_alpha = (prev->argb[j * prev->argb_stride + i]) >> 24;
-      if (prev_alpha != 0) {
-        is_key_frame = 0;
-        break;
-      }
-    }
-    if (!is_key_frame) break;
-  }
-  if (is_key_frame) return 1;
-
-  // If current frame covers the whole canvas and does not contain any
-  // transparent pixels that depend on previous canvas, then current frame is
-  // a key frame.
-  if (curr_rect->width == curr->width && curr_rect->height == curr->height) {
-    assert(curr_rect->x_offset == 0 && curr_rect->y_offset == 0);
-    is_key_frame = 1;
-    for (j = 0; j < prev->height; ++j) {
-      for (i = 0; i < prev->width; ++i) {
-        const uint32_t prev_alpha =
-            (prev->argb[j * prev->argb_stride + i]) >> 24;
-        const uint32_t curr_alpha =
-            (curr->argb[j * curr->argb_stride + i]) >> 24;
-        if (curr_alpha != 0xff && prev_alpha != 0) {
-          is_key_frame = 0;
-          break;
-        }
-      }
-      if (!is_key_frame) break;
-    }
-    if (is_key_frame) return 1;
-  }
-
-  return 0;
-}
-
-// Given 'prev' frame and current frame rectangle 'rect', convert 'curr' frame
-// to a key frame.
-static void ConvertToKeyFrame(const WebPPicture* const prev,
-                              WebPFrameRect* const rect,
-                              WebPPicture* const curr) {
-  int j;
-  assert(curr->width == prev->width && curr->height == prev->height);
-
-  // Replace transparent pixels of current canvas with those from previous
-  // canvas (with previous frame disposed).
-  for (j = 0; j < curr->height; ++j) {
-    int i;
-    for (i = 0; i < curr->width; ++i) {
-      uint32_t* const curr_pixel = curr->argb + j * curr->argb_stride + i;
-      const int curr_alpha = *curr_pixel >> 24;
-      if (curr_alpha == 0) {
-        *curr_pixel = prev->argb[j * prev->argb_stride + i];
-      }
-    }
-  }
-
-  // Frame rectangle now covers the whole canvas.
-  rect->x_offset = 0;
-  rect->y_offset = 0;
-  rect->width = curr->width;
-  rect->height = curr->height;
-}
-
-//------------------------------------------------------------------------------
-// Encoded frame.
-
-// Used to store two candidates of encoded data for an animation frame. One of
-// the two will be chosen later.
-typedef struct {
-  WebPMuxFrameInfo sub_frame;  // Encoded frame rectangle.
-  WebPMuxFrameInfo key_frame;  // Encoded frame if it was converted to keyframe.
-} EncodedFrame;
-
-// Release the data contained by 'encoded_frame'.
-static void FrameRelease(EncodedFrame* const encoded_frame) {
-  if (encoded_frame != NULL) {
-    WebPDataClear(&encoded_frame->sub_frame.bitstream);
-    WebPDataClear(&encoded_frame->key_frame.bitstream);
-    memset(encoded_frame, 0, sizeof(*encoded_frame));
-  }
-}
-
-//------------------------------------------------------------------------------
-// Frame cache.
-
-// Used to store encoded frames that haven't been output yet.
-struct WebPFrameCache {
-  EncodedFrame* encoded_frames;  // Array of encoded frames.
-  size_t size;               // Number of allocated data elements.
-  size_t start;              // Start index.
-  size_t count;              // Number of valid data elements.
-  int flush_count;           // If >0, ‘flush_count’ frames starting from
-                             // 'start' are ready to be added to mux.
-  int64_t best_delta;        // min(canvas size - frame size) over the frames.
-                             // Can be negative in certain cases due to
-                             // transparent pixels in a frame.
-  int keyframe;              // Index of selected keyframe relative to 'start'.
-
-  size_t kmin;                   // Min distance between key frames.
-  size_t kmax;                   // Max distance between key frames.
-  size_t count_since_key_frame;  // Frames seen since the last key frame.
-  int allow_mixed;           // If true, each frame can be lossy or lossless.
-  WebPPicture prev_canvas;   // Previous canvas (properly disposed).
-  WebPPicture curr_canvas;   // Current canvas (temporary buffer).
-  int is_first_frame;        // True if no frames have been added to the cache
-                             // since WebPFrameCacheNew().
-};
-
-// Reset the counters in the cache struct. Doesn't touch 'cache->encoded_frames'
-// and 'cache->size'.
-static void CacheReset(WebPFrameCache* const cache) {
-  cache->start = 0;
-  cache->count = 0;
-  cache->flush_count = 0;
-  cache->best_delta = DELTA_INFINITY;
-  cache->keyframe = KEYFRAME_NONE;
-}
-
-WebPFrameCache* WebPFrameCacheNew(int width, int height,
-                                  size_t kmin, size_t kmax, int allow_mixed) {
-  WebPFrameCache* cache = (WebPFrameCache*)malloc(sizeof(*cache));
-  if (cache == NULL) return NULL;
-  CacheReset(cache);
-  // sanity init, so we can call WebPFrameCacheDelete():
-  cache->encoded_frames = NULL;
-
-  cache->is_first_frame = 1;
-
-  // Picture buffers.
-  if (!WebPPictureInit(&cache->prev_canvas) ||
-      !WebPPictureInit(&cache->curr_canvas)) {
-    return NULL;
-  }
-  cache->prev_canvas.width = width;
-  cache->prev_canvas.height = height;
-  cache->prev_canvas.use_argb = 1;
-  if (!WebPPictureAlloc(&cache->prev_canvas) ||
-      !WebPPictureCopy(&cache->prev_canvas, &cache->curr_canvas)) {
-    goto Err;
-  }
-  WebPUtilClearPic(&cache->prev_canvas, NULL);
-
-  // Cache data.
-  cache->allow_mixed = allow_mixed;
-  cache->kmin = kmin;
-  cache->kmax = kmax;
-  cache->count_since_key_frame = 0;
-  assert(kmax > kmin);
-  cache->size = kmax - kmin;
-  cache->encoded_frames =
-      (EncodedFrame*)calloc(cache->size, sizeof(*cache->encoded_frames));
-  if (cache->encoded_frames == NULL) goto Err;
-
-  return cache;  // All OK.
-
- Err:
-  WebPFrameCacheDelete(cache);
-  return NULL;
-}
-
-void WebPFrameCacheDelete(WebPFrameCache* const cache) {
-  if (cache != NULL) {
-    if (cache->encoded_frames != NULL) {
-      size_t i;
-      for (i = 0; i < cache->size; ++i) {
-        FrameRelease(&cache->encoded_frames[i]);
-      }
-      free(cache->encoded_frames);
-    }
-    WebPPictureFree(&cache->prev_canvas);
-    WebPPictureFree(&cache->curr_canvas);
-    free(cache);
-  }
-}
-
-static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
-                       WebPMemoryWriter* const memory) {
-  pic->use_argb = 1;
-  pic->writer = WebPMemoryWrite;
-  pic->custom_ptr = memory;
-  if (!WebPEncode(config, pic)) {
-    return 0;
-  }
-  return 1;
-}
-
-static void GetEncodedData(const WebPMemoryWriter* const memory,
-                           WebPData* const encoded_data) {
-  encoded_data->bytes = memory->mem;
-  encoded_data->size  = memory->size;
-}
-
-#define MIN_COLORS_LOSSY     31  // Don't try lossy below this threshold.
-#define MAX_COLORS_LOSSLESS 194  // Don't try lossless above this threshold.
-#define MAX_COLOR_COUNT     256  // Power of 2 greater than MAX_COLORS_LOSSLESS.
-#define HASH_SIZE (MAX_COLOR_COUNT * 4)
-#define HASH_RIGHT_SHIFT     22  // 32 - log2(HASH_SIZE).
-
-// TODO(urvang): Also used in enc/vp8l.c. Move to utils.
-// If the number of colors in the 'pic' is at least MAX_COLOR_COUNT, return
-// MAX_COLOR_COUNT. Otherwise, return the exact number of colors in the 'pic'.
-static int GetColorCount(const WebPPicture* const pic) {
-  int x, y;
-  int num_colors = 0;
-  uint8_t in_use[HASH_SIZE] = { 0 };
-  uint32_t colors[HASH_SIZE];
-  static const uint32_t kHashMul = 0x1e35a7bd;
-  const uint32_t* argb = pic->argb;
-  const int width = pic->width;
-  const int height = pic->height;
-  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
-
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int key;
-      if (argb[x] == last_pix) {
-        continue;
-      }
-      last_pix = argb[x];
-      key = (kHashMul * last_pix) >> HASH_RIGHT_SHIFT;
-      while (1) {
-        if (!in_use[key]) {
-          colors[key] = last_pix;
-          in_use[key] = 1;
-          ++num_colors;
-          if (num_colors >= MAX_COLOR_COUNT) {
-            return MAX_COLOR_COUNT;  // Exact count not needed.
-          }
-          break;
-        } else if (colors[key] == last_pix) {
-          break;  // The color is already there.
-        } else {
-          // Some other color sits here, so do linear conflict resolution.
-          ++key;
-          key &= (HASH_SIZE - 1);  // Key mask.
-        }
-      }
-    }
-    argb += pic->argb_stride;
-  }
-  return num_colors;
-}
-
-#undef MAX_COLOR_COUNT
-#undef HASH_SIZE
-#undef HASH_RIGHT_SHIFT
-
-static int SetFrame(const WebPConfig* const config, int allow_mixed,
-                    int is_key_frame, const WebPPicture* const prev_canvas,
-                    WebPPicture* const frame, const WebPFrameRect* const rect,
-                    const WebPMuxFrameInfo* const info,
-                    WebPPicture* const sub_frame, EncodedFrame* encoded_frame) {
-  int try_lossless;
-  int try_lossy;
-  int try_both;
-  WebPMemoryWriter mem1, mem2;
-  WebPData* encoded_data;
-  WebPMuxFrameInfo* const dst =
-      is_key_frame ? &encoded_frame->key_frame : &encoded_frame->sub_frame;
-  *dst = *info;
-  encoded_data = &dst->bitstream;
-  WebPMemoryWriterInit(&mem1);
-  WebPMemoryWriterInit(&mem2);
-
-  if (!allow_mixed) {
-    try_lossless = config->lossless;
-    try_lossy = !try_lossless;
-  } else {  // Use a heuristic for trying lossless and/or lossy compression.
-    const int num_colors = GetColorCount(sub_frame);
-    try_lossless = (num_colors < MAX_COLORS_LOSSLESS);
-    try_lossy = (num_colors >= MIN_COLORS_LOSSY);
-  }
-  try_both = try_lossless && try_lossy;
-
-  if (try_lossless) {
-    WebPConfig config_ll = *config;
-    config_ll.lossless = 1;
-    if (!EncodeFrame(&config_ll, sub_frame, &mem1)) {
-      goto Err;
-    }
-  }
-
-  if (try_lossy) {
-    WebPConfig config_lossy = *config;
-    config_lossy.lossless = 0;
-    if (!is_key_frame) {
-      // For lossy compression of a frame, it's better to replace transparent
-      // pixels of 'curr' with actual RGB values, whenever possible.
-      ReduceTransparency(prev_canvas, rect, frame);
-      // TODO(later): Investigate if this helps lossless compression as well.
-      FlattenSimilarBlocks(prev_canvas, rect, frame);
-    }
-    if (!EncodeFrame(&config_lossy, sub_frame, &mem2)) {
-      goto Err;
-    }
-  }
-
-  if (try_both) {  // Pick the encoding with smallest size.
-    // TODO(later): Perhaps a rough SSIM/PSNR produced by the encoder should
-    // also be a criteria, in addition to sizes.
-    if (mem1.size <= mem2.size) {
-      free(mem2.mem);
-      GetEncodedData(&mem1, encoded_data);
-    } else {
-      free(mem1.mem);
-      GetEncodedData(&mem2, encoded_data);
-    }
-  } else {
-    GetEncodedData(try_lossless ? &mem1 : &mem2, encoded_data);
-  }
-  return 1;
-
- Err:
-  free(mem1.mem);
-  free(mem2.mem);
-  return 0;
-}
-
-#undef MIN_COLORS_LOSSY
-#undef MAX_COLORS_LOSSLESS
-
-// Returns cached frame at given 'position' index.
-static EncodedFrame* CacheGetFrame(const WebPFrameCache* const cache,
-                                   size_t position) {
-  assert(cache->start + position < cache->size);
-  return &cache->encoded_frames[cache->start + position];
-}
-
-// Calculate the penalty incurred if we encode given frame as a key frame
-// instead of a sub-frame.
-static int64_t KeyFramePenalty(const EncodedFrame* const encoded_frame) {
-  return ((int64_t)encoded_frame->key_frame.bitstream.size -
-          encoded_frame->sub_frame.bitstream.size);
-}
-
-static void DisposeFrame(WebPMuxAnimDispose dispose_method,
-                         const WebPFrameRect* const gif_rect,
-                         WebPPicture* const frame, WebPPicture* const canvas) {
-  if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
-    WebPUtilClearPic(frame, NULL);
-    WebPUtilClearPic(canvas, gif_rect);
-  }
-}
-
-int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
-                           const WebPConfig* const config,
-                           const WebPFrameRect* const orig_rect,
-                           WebPPicture* const frame,
-                           WebPMuxFrameInfo* const info) {
-  int ok = 0;
-  WebPFrameRect rect = *orig_rect;
-  WebPPicture sub_image;  // View extracted from 'frame' with rectangle 'rect'.
-  WebPPicture* const prev_canvas = &cache->prev_canvas;
-  const size_t position = cache->count;
-  const int allow_mixed = cache->allow_mixed;
-  EncodedFrame* const encoded_frame = CacheGetFrame(cache, position);
-  assert(position < cache->size);
-
-  // Snap to even offsets (and adjust dimensions if needed).
-  rect.width += (rect.x_offset & 1);
-  rect.height += (rect.y_offset & 1);
-  rect.x_offset &= ~1;
-  rect.y_offset &= ~1;
-
-  if (!WebPPictureView(frame, rect.x_offset, rect.y_offset,
-                       rect.width, rect.height, &sub_image)) {
-    return 0;
-  }
-  info->x_offset = rect.x_offset;
-  info->y_offset = rect.y_offset;
-
-  ++cache->count;
-
-  if (cache->is_first_frame || IsKeyFrame(frame, &rect, prev_canvas)) {
-    // Add this as a key frame.
-    if (!SetFrame(config, allow_mixed, 1, NULL, NULL, NULL, info, &sub_image,
-                  encoded_frame)) {
-      goto End;
-    }
-    cache->keyframe = position;
-    cache->flush_count = cache->count;
-    cache->count_since_key_frame = 0;
-    // Update prev_canvas by simply copying from 'curr'.
-    CopyPixels(frame, prev_canvas);
-  } else {
-    ++cache->count_since_key_frame;
-    if (cache->count_since_key_frame <= cache->kmin) {
-      // Add this as a frame rectangle.
-      if (!SetFrame(config, allow_mixed, 0, prev_canvas, frame, &rect, info,
-                    &sub_image, encoded_frame)) {
-        goto End;
-      }
-      cache->flush_count = cache->count;
-      // Update prev_canvas by blending 'curr' into it.
-      BlendPixels(frame, orig_rect, prev_canvas);
-    } else {
-      WebPPicture full_image;
-      WebPMuxFrameInfo full_image_info;
-      int frame_added;
-      int64_t curr_delta;
-
-      // Add frame rectangle to cache.
-      if (!SetFrame(config, allow_mixed, 0, prev_canvas, frame, &rect, info,
-                    &sub_image, encoded_frame)) {
-        goto End;
-      }
-
-      // Convert to a key frame.
-      CopyPixels(frame, &cache->curr_canvas);
-      ConvertToKeyFrame(prev_canvas, &rect, &cache->curr_canvas);
-      if (!WebPPictureView(&cache->curr_canvas, rect.x_offset, rect.y_offset,
-                           rect.width, rect.height, &full_image)) {
-        goto End;
-      }
-      full_image_info = *info;
-      full_image_info.x_offset = rect.x_offset;
-      full_image_info.y_offset = rect.y_offset;
-
-      // Add key frame to cache, too.
-      frame_added = SetFrame(config, allow_mixed, 1, NULL, NULL, NULL,
-                             &full_image_info, &full_image, encoded_frame);
-      WebPPictureFree(&full_image);
-      if (!frame_added) goto End;
-
-      // Analyze size difference of the two variants.
-      curr_delta = KeyFramePenalty(encoded_frame);
-      if (curr_delta <= cache->best_delta) {  // Pick this as keyframe.
-        cache->keyframe = position;
-        cache->best_delta = curr_delta;
-        cache->flush_count = cache->count - 1;  // We can flush previous frames.
-      }
-      if (cache->count_since_key_frame == cache->kmax) {
-        cache->flush_count = cache->count;
-        cache->count_since_key_frame = 0;
-      }
-
-      // Update prev_canvas by simply copying from 'curr_canvas'.
-      CopyPixels(&cache->curr_canvas, prev_canvas);
-    }
-  }
-
-  DisposeFrame(info->dispose_method, orig_rect, frame, prev_canvas);
-
-  cache->is_first_frame = 0;
-  ok = 1;
-
- End:
-  WebPPictureFree(&sub_image);
-  if (!ok) {
-    FrameRelease(encoded_frame);
-    --cache->count;  // We reset the count, as the frame addition failed.
-  }
-  return ok;
-}
-
-WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
-                                 WebPMux* const mux) {
-  while (cache->flush_count > 0) {
-    WebPMuxFrameInfo* info;
-    WebPMuxError err;
-    EncodedFrame* const curr = CacheGetFrame(cache, 0);
-    // Pick frame or full canvas.
-    if (cache->keyframe == 0) {
-      info = &curr->key_frame;
-      info->blend_method = WEBP_MUX_NO_BLEND;
-      cache->keyframe = KEYFRAME_NONE;
-      cache->best_delta = DELTA_INFINITY;
-    } else {
-      info = &curr->sub_frame;
-      info->blend_method = WEBP_MUX_BLEND;
-    }
-    // Add to mux.
-    err = WebPMuxPushFrame(mux, info, 1);
-    if (err != WEBP_MUX_OK) return err;
-    if (verbose) {
-      printf("Added frame. offset:%d,%d duration:%d dispose:%d blend:%d\n",
-             info->x_offset, info->y_offset, info->duration,
-             info->dispose_method, info->blend_method);
-    }
-    FrameRelease(curr);
-    ++cache->start;
-    --cache->flush_count;
-    --cache->count;
-    if (cache->keyframe != KEYFRAME_NONE) --cache->keyframe;
-  }
-
-  if (cache->count == 0) CacheReset(cache);
-  return WEBP_MUX_OK;
-}
-
-WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
-                                    WebPMux* const mux) {
-  cache->flush_count = cache->count;  // Force flushing of all frames.
-  return WebPFrameCacheFlush(cache, verbose, mux);
-}
-
-//------------------------------------------------------------------------------
--- a/examples/gif2webp_util.h
+++ b/examples/gif2webp_util.h
@@ -1,80 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//  Helper structs and methods for gif2webp tool.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-#define WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
-
-#include <stdlib.h>
-
-#include "webp/mux.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-//------------------------------------------------------------------------------
-// Helper utilities.
-
-#define WEBP_UTIL_TRANSPARENT_COLOR 0x00ffffff
-
-struct WebPPicture;
-
-typedef struct {
-  int x_offset, y_offset, width, height;
-} WebPFrameRect;
-
-// Clear pixels in 'picture' within given 'rect' to transparent color.
-void WebPUtilClearPic(struct WebPPicture* const picture,
-                      const WebPFrameRect* const rect);
-
-//------------------------------------------------------------------------------
-// Frame cache.
-
-typedef struct WebPFrameCache WebPFrameCache;
-
-// Given the minimum distance between key frames 'kmin' and maximum distance
-// between key frames 'kmax', returns an appropriately allocated cache object.
-// If 'allow_mixed' is true, the subsequent calls to WebPFrameCacheAddFrame()
-// will heuristically pick lossy or lossless compression for each frame.
-// Use WebPFrameCacheDelete() to deallocate the 'cache'.
-WebPFrameCache* WebPFrameCacheNew(int width, int height,
-                                  size_t kmin, size_t kmax, int allow_mixed);
-
-// Release all the frame data from 'cache' and free 'cache'.
-void WebPFrameCacheDelete(WebPFrameCache* const cache);
-
-// Given an image described by 'frame', 'info' and 'orig_rect', optimize it for
-// WebP, encode it and add it to 'cache'.
-// This takes care of frame disposal too, according to 'info->dispose_method'.
-int WebPFrameCacheAddFrame(WebPFrameCache* const cache,
-                           const WebPConfig* const config,
-                           const WebPFrameRect* const orig_rect,
-                           WebPPicture* const frame,
-                           WebPMuxFrameInfo* const info);
-
-// Flush the *ready* frames from cache and add them to 'mux'. If 'verbose' is
-// true, prints the information about these frames.
-WebPMuxError WebPFrameCacheFlush(WebPFrameCache* const cache, int verbose,
-                                 WebPMux* const mux);
-
-// Similar to 'WebPFrameCacheFlushFrames()', but flushes *all* the frames.
-WebPMuxError WebPFrameCacheFlushAll(WebPFrameCache* const cache, int verbose,
-                                    WebPMux* const mux);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  // WEBP_EXAMPLES_GIF2WEBP_UTIL_H_
--- a/examples/jpegdec.c
+++ b/examples/jpegdec.c
@@ -232,6 +232,7 @@ int ReadJPEG(FILE* in_file, WebPPicture* const pic, Metadata* const metadata) {
  jpeg_read_header(&dinfo, TRUE);

  dinfo.out_color_space = JCS_RGB;
+  dinfo.dct_method = JDCT_IFAST;
  dinfo.do_fancy_upsampling = TRUE;

  jpeg_start_decompress(&dinfo);
--- a/examples/jpegdec.h
+++ b/examples/jpegdec.h
@@ -15,7 +15,7 @@
 #include <stdio.h>
 #include "webp/types.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -28,7 +28,7 @@ struct WebPPicture;
 int ReadJPEG(FILE* in_file, struct WebPPicture* const pic,
             struct Metadata* const metadata);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/examples/metadata.h
+++ b/examples/metadata.h
@@ -15,7 +15,7 @@

 #include "webp/types.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -40,7 +40,7 @@ void MetadataFree(Metadata* const metadata);
 int MetadataCopy(const char* metadata, size_t metadata_len,
                 MetadataPayload* const payload);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/examples/pngdec.c
+++ b/examples/pngdec.c
@@ -21,13 +21,12 @@
 #include <png.h>
 #include <setjmp.h>   // note: this must be included *after* png.h
 #include <stdlib.h>
-#include <string.h>

 #include "webp/encode.h"
 #include "./metadata.h"

-static void PNGAPI error_function(png_structp png, png_const_charp error) {
-  if (error != NULL) fprintf(stderr, "libpng error: %s\n", error);
+static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
+  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }

@@ -271,7 +270,6 @@ int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha,

  pic->width = width;
  pic->height = height;
-  pic->use_argb = 1;
  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
                 : WebPPictureImportRGB(pic, rgb, stride);

--- a/examples/pngdec.h
+++ b/examples/pngdec.h
@@ -14,7 +14,7 @@

 #include <stdio.h>

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -28,7 +28,7 @@ struct WebPPicture;
 int ReadPNG(FILE* in_file, struct WebPPicture* const pic, int keep_alpha,
            struct Metadata* const metadata);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@@ -19,10 +19,6 @@

 typedef LARGE_INTEGER Stopwatch;

-static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
-  QueryPerformanceCounter(watch);
-}
-
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const LARGE_INTEGER old_value = *watch;
  LARGE_INTEGER freq;
@@ -41,10 +37,6 @@ static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {

 typedef struct timeval Stopwatch;

-static WEBP_INLINE void StopwatchReset(Stopwatch* watch) {
-  gettimeofday(watch, NULL);
-}
-
 static WEBP_INLINE double StopwatchReadAndReset(Stopwatch* watch) {
  const struct timeval old_value = *watch;
  gettimeofday(watch, NULL);
--- a/examples/tiffdec.c
+++ b/examples/tiffdec.c
@@ -100,7 +100,6 @@ int ReadTIFF(const char* const filename,
 #ifdef __BIG_ENDIAN__
      TIFFSwabArrayOfLong(raster, width * height);
 #endif
-      pic->use_argb = 1;
      ok = keep_alpha
         ? WebPPictureImportRGBA(pic, (const uint8_t*)raster, stride)
         : WebPPictureImportRGBX(pic, (const uint8_t*)raster, stride);
--- a/examples/tiffdec.h
+++ b/examples/tiffdec.h
@@ -12,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_TIFFDEC_H_
 #define WEBP_EXAMPLES_TIFFDEC_H_

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -27,7 +27,7 @@ int ReadTIFF(const char* const filename,
             struct WebPPicture* const pic, int keep_alpha,
             struct Metadata* const metadata);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/examples/vwebp.c
+++ b/examples/vwebp.c
@@ -18,8 +18,6 @@
 #include <stdlib.h>
 #include <string.h>

-#if defined(WEBP_HAVE_GL)
-
 #if defined(HAVE_GLUT_GLUT_H)
 #include <GLUT/glut.h>
 #else
@@ -59,11 +57,15 @@ static struct {

  const char* file_name;
  WebPData data;
-  WebPDecoderConfig config;
+  WebPDecoderConfig* config;
  const WebPDecBuffer* pic;
  WebPDemuxer* dmux;
-  WebPIterator curr_frame;
-  WebPIterator prev_frame;
+  WebPIterator frameiter;
+  struct {
+    int width, height;
+    int x_offset, y_offset;
+    enum WebPMuxAnimDispose dispose_method;
+  } prev_frame;
  WebPChunkIterator iccp;
 } kParams;

@@ -75,8 +77,7 @@ static void ClearPreviousPic(void) {
 static void ClearParams(void) {
  ClearPreviousPic();
  WebPDataClear(&kParams.data);
-  WebPDemuxReleaseIterator(&kParams.curr_frame);
-  WebPDemuxReleaseIterator(&kParams.prev_frame);
+  WebPDemuxReleaseIterator(&kParams.frameiter);
  WebPDemuxReleaseChunkIterator(&kParams.iccp);
  WebPDemuxDelete(kParams.dmux);
  kParams.dmux = NULL;
@@ -146,25 +147,25 @@ static int ApplyColorProfile(const WebPData* const profile,
 //------------------------------------------------------------------------------
 // File decoding

-static int Decode(void) {   // Fills kParams.curr_frame
-  const WebPIterator* const curr = &kParams.curr_frame;
-  WebPDecoderConfig* const config = &kParams.config;
+static int Decode(void) {   // Fills kParams.frameiter
+  const WebPIterator* const iter = &kParams.frameiter;
+  WebPDecoderConfig* const config = kParams.config;
  WebPDecBuffer* const output_buffer = &config->output;
  int ok = 0;

  ClearPreviousPic();
  output_buffer->colorspace = MODE_RGBA;
-  ok = (WebPDecode(curr->fragment.bytes, curr->fragment.size,
+  ok = (WebPDecode(iter->fragment.bytes, iter->fragment.size,
                   config) == VP8_STATUS_OK);
  if (!ok) {
-    fprintf(stderr, "Decoding of frame #%d failed!\n", curr->frame_num);
+    fprintf(stderr, "Decoding of frame #%d failed!\n", iter->frame_num);
  } else {
    kParams.pic = output_buffer;
    if (kParams.use_color_profile) {
      ok = ApplyColorProfile(&kParams.iccp.chunk, output_buffer);
      if (!ok) {
        fprintf(stderr, "Applying color profile to frame #%d failed!\n",
-                curr->frame_num);
+                iter->frame_num);
      }
    }
  }
@@ -175,10 +176,10 @@ static void decode_callback(int what) {
  if (what == 0 && !kParams.done) {
    int duration = 0;
    if (kParams.dmux != NULL) {
-      WebPIterator* const curr = &kParams.curr_frame;
-      if (!WebPDemuxNextFrame(curr)) {
-        WebPDemuxReleaseIterator(curr);
-        if (WebPDemuxGetFrame(kParams.dmux, 1, curr)) {
+      WebPIterator* const iter = &kParams.frameiter;
+      if (!WebPDemuxNextFrame(iter)) {
+        WebPDemuxReleaseIterator(iter);
+        if (WebPDemuxGetFrame(kParams.dmux, 1, iter)) {
          --kParams.loop_count;
          kParams.done = (kParams.loop_count == 0);
        } else {
@@ -187,7 +188,7 @@ static void decode_callback(int what) {
          return;
        }
      }
-      duration = curr->duration;
+      duration = iter->duration;
    }
    if (!Decode()) {
      kParams.decoding_error = 1;
@@ -280,45 +281,40 @@ static void DrawCheckerBoard(void) {

 static void HandleDisplay(void) {
  const WebPDecBuffer* const pic = kParams.pic;
-  const WebPIterator* const curr = &kParams.curr_frame;
-  WebPIterator* const prev = &kParams.prev_frame;
+  const WebPIterator* const iter = &kParams.frameiter;
  GLfloat xoff, yoff;
  if (pic == NULL) return;
  glPushMatrix();
  glPixelZoom(1, -1);
-  xoff = (GLfloat)(2. * curr->x_offset / kParams.canvas_width);
-  yoff = (GLfloat)(2. * curr->y_offset / kParams.canvas_height);
+  xoff = (GLfloat)(2. * iter->x_offset / kParams.canvas_width);
+  yoff = (GLfloat)(2. * iter->y_offset / kParams.canvas_height);
  glRasterPos2f(-1.f + xoff, 1.f - yoff);
  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
  glPixelStorei(GL_UNPACK_ROW_LENGTH, pic->u.RGBA.stride / 4);

-  if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ||
-      curr->blend_method == WEBP_MUX_NO_BLEND) {
+  if (kParams.prev_frame.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
    // TODO(later): these offsets and those above should factor in window size.
    //              they will be incorrect if the window is resized.
    // glScissor() takes window coordinates (0,0 at bottom left).
-    int window_x, window_y;
-    if (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
-      // Clear the previous frame rectangle.
-      window_x = prev->x_offset;
-      window_y = kParams.canvas_height - prev->y_offset - prev->height;
-    } else {  // curr->blend_method == WEBP_MUX_NO_BLEND.
-      // We simulate no-blending behavior by first clearing the current frame
-      // rectangle (to a checker-board) and then alpha-blending against it.
-      window_x = curr->x_offset;
-      window_y = kParams.canvas_height - curr->y_offset - curr->height;
-    }
+    const int window_x = kParams.prev_frame.x_offset;
+    const int window_y = kParams.canvas_height -
+                         kParams.prev_frame.y_offset -
+                         kParams.prev_frame.height;
    glEnable(GL_SCISSOR_TEST);
-    // Only update the requested area, not the whole canvas.
-    glScissor(window_x, window_y, prev->width, prev->height);
+    // Only update the requested area, not the whole canvas.
+    glScissor(window_x, window_y,
+              kParams.prev_frame.width, kParams.prev_frame.height);

    glClear(GL_COLOR_BUFFER_BIT);  // use clear color
    DrawCheckerBoard();

    glDisable(GL_SCISSOR_TEST);
  }
-
-  *prev = *curr;
+  kParams.prev_frame.width = iter->width;
+  kParams.prev_frame.height = iter->height;
+  kParams.prev_frame.x_offset = iter->x_offset;
+  kParams.prev_frame.y_offset = iter->y_offset;
+  kParams.prev_frame.dispose_method = iter->dispose_method;

  glDrawPixels(pic->width, pic->height,
               GL_RGBA, GL_UNSIGNED_BYTE,
@@ -334,9 +330,9 @@ static void HandleDisplay(void) {
    glColor4f(0.90f, 0.0f, 0.90f, 1.0f);
    glRasterPos2f(-0.95f, 0.80f);
    PrintString(tmp);
-    if (curr->x_offset != 0 || curr->y_offset != 0) {
+    if (iter->x_offset != 0 || iter->y_offset != 0) {
      snprintf(tmp, sizeof(tmp), " (offset:%d,%d)",
-               curr->x_offset, curr->y_offset);
+               iter->x_offset, iter->y_offset);
      glRasterPos2f(-0.95f, 0.70f);
      PrintString(tmp);
    }
@@ -376,7 +372,6 @@ static void Help(void) {
         "  -noicc ....... don't use the icc profile if present.\n"
         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
         "  -nofilter .... disable in-loop filtering.\n"
-         "  -dither <int>  dithering strength (0..100). Default=50.\n"
         "  -mt .......... use multi-threading.\n"
         "  -info ........ print info.\n"
         "  -h     ....... this help message.\n"
@@ -389,16 +384,14 @@ static void Help(void) {
 }

 int main(int argc, char *argv[]) {
+  WebPDecoderConfig config;
  int c;
-  WebPDecoderConfig* const config = &kParams.config;
-  WebPIterator* const curr = &kParams.curr_frame;
-  WebPIterator* const prev = &kParams.prev_frame;

-  if (!WebPInitDecoderConfig(config)) {
+  if (!WebPInitDecoderConfig(&config)) {
    fprintf(stderr, "Library version mismatch!\n");
    return -1;
  }
-  config->options.dithering_strength = 50;
+  kParams.config = &config;
  kParams.use_color_profile = 1;

  for (c = 1; c < argc; ++c) {
@@ -408,11 +401,9 @@ int main(int argc, char *argv[]) {
    } else if (!strcmp(argv[c], "-noicc")) {
      kParams.use_color_profile = 0;
    } else if (!strcmp(argv[c], "-nofancy")) {
-      config->options.no_fancy_upsampling = 1;
+      config.options.no_fancy_upsampling = 1;
    } else if (!strcmp(argv[c], "-nofilter")) {
-      config->options.bypass_filtering = 1;
-    } else if (!strcmp(argv[c], "-dither") && c + 1 < argc) {
-      config->options.dithering_strength = strtol(argv[++c], NULL, 0);
+      config.options.bypass_filtering = 1;
    } else if (!strcmp(argv[c], "-info")) {
      kParams.print_info = 1;
    } else if (!strcmp(argv[c], "-version")) {
@@ -424,10 +415,7 @@ int main(int argc, char *argv[]) {
             (dmux_version >> 8) & 0xff, dmux_version & 0xff);
      return 0;
    } else if (!strcmp(argv[c], "-mt")) {
-      config->options.use_threads = 1;
-    } else if (!strcmp(argv[c], "--")) {
-      if (c < argc - 1) kParams.file_name = argv[++c];
-      break;
+      config.options.use_threads = 1;
    } else if (argv[c][0] == '-') {
      printf("Unknown option '%s'\n", argv[c]);
      Help();
@@ -469,10 +457,10 @@ int main(int argc, char *argv[]) {
    printf("Canvas: %d x %d\n", kParams.canvas_width, kParams.canvas_height);
  }

-  prev->width = kParams.canvas_width;
-  prev->height = kParams.canvas_height;
-  prev->x_offset = prev->y_offset = 0;
-  prev->dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;
+  kParams.prev_frame.width = kParams.canvas_width;
+  kParams.prev_frame.height = kParams.canvas_height;
+  kParams.prev_frame.x_offset = kParams.prev_frame.y_offset = 0;
+  kParams.prev_frame.dispose_method = WEBP_MUX_DISPOSE_BACKGROUND;

  memset(&kParams.iccp, 0, sizeof(kParams.iccp));
  kParams.has_color_profile =
@@ -488,20 +476,20 @@ int main(int argc, char *argv[]) {
 #endif
  }

-  if (!WebPDemuxGetFrame(kParams.dmux, 1, curr)) goto Error;
+  if (!WebPDemuxGetFrame(kParams.dmux, 1, &kParams.frameiter)) goto Error;

-  kParams.has_animation = (curr->num_frames > 1);
+  kParams.has_animation = (kParams.frameiter.num_frames > 1);
  kParams.loop_count = (int)WebPDemuxGetI(kParams.dmux, WEBP_FF_LOOP_COUNT);
  kParams.bg_color = WebPDemuxGetI(kParams.dmux, WEBP_FF_BACKGROUND_COLOR);
  printf("VP8X: Found %d images in file (loop count = %d)\n",
-         curr->num_frames, kParams.loop_count);
+         kParams.frameiter.num_frames, kParams.loop_count);

  // Decode first frame
  if (!Decode()) goto Error;

  // Position iterator to last frame. Next call to HandleDisplay will wrap over.
  // We take this into account by bumping up loop_count.
-  WebPDemuxGetFrame(kParams.dmux, 0, curr);
+  WebPDemuxGetFrame(kParams.dmux, 0, &kParams.frameiter);
  if (kParams.loop_count) ++kParams.loop_count;

  // Start display (and timer)
@@ -523,14 +511,4 @@ int main(int argc, char *argv[]) {
  return -1;
 }

-#else   // !WEBP_HAVE_GL
-
-int main(int argc, const char *argv[]) {
-  fprintf(stderr, "OpenGL support not enabled in %s.\n", argv[0]);
-  (void)argc;
-  return 0;
-}
-
-#endif
-
 //------------------------------------------------------------------------------
--- a/examples/webpmux.c
+++ b/examples/webpmux.c
@@ -53,7 +53,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "webp/decode.h"
 #include "webp/mux.h"
 #include "./example_util.h"

@@ -146,6 +145,12 @@ static const char* ErrorString(WebPMuxError err) {
    return err;                                                      \
  }

+#define RETURN_IF_ERROR2(ERR_MSG, FORMAT_STR)                        \
+  if (err != WEBP_MUX_OK) {                                          \
+    fprintf(stderr, ERR_MSG, FORMAT_STR);                            \
+    return err;                                                      \
+  }
+
 #define RETURN_IF_ERROR3(ERR_MSG, FORMAT_STR1, FORMAT_STR2)          \
  if (err != WEBP_MUX_OK) {                                          \
    fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);              \
@@ -167,21 +172,16 @@ static const char* ErrorString(WebPMuxError err) {
  } while (0)

 #define ERROR_GOTO3(ERR_MSG, FORMAT_STR1, FORMAT_STR2, LABEL)        \
-  do {                                                               \
-    fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);              \
-    ok = 0;                                                          \
-    goto LABEL;                                                      \
-  } while (0)
+   do {                                                              \
+     fprintf(stderr, ERR_MSG, FORMAT_STR1, FORMAT_STR2);             \
+     ok = 0;                                                         \
+     goto LABEL;                                                     \
+   } while (0)

 static WebPMuxError DisplayInfo(const WebPMux* mux) {
-  int width, height;
  uint32_t flag;

-  WebPMuxError err = WebPMuxGetCanvasSize(mux, &width, &height);
-  assert(err == WEBP_MUX_OK);  // As WebPMuxCreate() was successful earlier.
-  printf("Canvas size: %d x %d\n", width, height);
-
-  err = WebPMuxGetFeatures(mux, &flag);
+  WebPMuxError err = WebPMuxGetFeatures(mux, &flag);
 #ifndef WEBP_EXPERIMENTAL_FEATURES
  if (flag & FRAGMENTS_FLAG) err = WEBP_MUX_INVALID_ARGUMENT;
 #endif
@@ -211,40 +211,26 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
    if (is_anim) {
      WebPMuxAnimParams params;
      err = WebPMuxGetAnimationParams(mux, &params);
-      assert(err == WEBP_MUX_OK);
+      RETURN_IF_ERROR("Failed to retrieve animation parameters\n");
      printf("Background color : 0x%.8X  Loop Count : %d\n",
             params.bgcolor, params.loop_count);
    }

    err = WebPMuxNumChunks(mux, id, &nFrames);
-    assert(err == WEBP_MUX_OK);
+    RETURN_IF_ERROR2("Failed to retrieve number of %ss\n", type_str);

    printf("Number of %ss: %d\n", type_str, nFrames);
    if (nFrames > 0) {
      int i;
-      printf("No.: width height alpha x_offset y_offset ");
-      if (is_anim) printf("duration   dispose blend ");
+      printf("No.: x_offset y_offset ");
+      if (is_anim) printf("duration dispose ");
      printf("image_size\n");
      for (i = 1; i <= nFrames; i++) {
        WebPMuxFrameInfo frame;
        err = WebPMuxGetFrame(mux, i, &frame);
        if (err == WEBP_MUX_OK) {
-          WebPBitstreamFeatures features;
-          const VP8StatusCode status = WebPGetFeatures(
-              frame.bitstream.bytes, frame.bitstream.size, &features);
-          assert(status == VP8_STATUS_OK);  // Checked by WebPMuxCreate().
-          (void)status;
-          printf("%3d: %5d %5d %5s %8d %8d ", i, features.width,
-                 features.height, features.has_alpha ? "yes" : "no",
-                 frame.x_offset, frame.y_offset);
-          if (is_anim) {
-            const char* const dispose =
-                (frame.dispose_method == WEBP_MUX_DISPOSE_NONE) ? "none"
-                                                                : "background";
-            const char* const blend =
-                (frame.blend_method == WEBP_MUX_BLEND) ? "yes" : "no";
-            printf("%8d %10s %5s ", frame.duration, dispose, blend);
-          }
+          printf("%3d: %8d %8d ", i, frame.x_offset, frame.y_offset);
+          if (is_anim) printf("%8d %7d ", frame.duration, frame.dispose_method);
          printf("%10d\n", (int)frame.bitstream.size);
        }
        WebPDataClear(&frame.bitstream);
@@ -256,21 +242,21 @@ static WebPMuxError DisplayInfo(const WebPMux* mux) {
  if (flag & ICCP_FLAG) {
    WebPData icc_profile;
    err = WebPMuxGetChunk(mux, "ICCP", &icc_profile);
-    assert(err == WEBP_MUX_OK);
+    RETURN_IF_ERROR("Failed to retrieve the ICC profile\n");
    printf("Size of the ICC profile data: %d\n", (int)icc_profile.size);
  }

  if (flag & EXIF_FLAG) {
    WebPData exif;
    err = WebPMuxGetChunk(mux, "EXIF", &exif);
-    assert(err == WEBP_MUX_OK);
+    RETURN_IF_ERROR("Failed to retrieve the EXIF metadata\n");
    printf("Size of the EXIF metadata: %d\n", (int)exif.size);
  }

  if (flag & XMP_FLAG) {
    WebPData xmp;
    err = WebPMuxGetChunk(mux, "XMP ", &xmp);
-    assert(err == WEBP_MUX_OK);
+    RETURN_IF_ERROR("Failed to retrieve the XMP metadata\n");
    printf("Size of the XMP metadata: %d\n", (int)xmp.size);
  }

@@ -342,13 +328,11 @@ static void PrintHelp(void) {
  printf("\n");
  printf("FRAME_OPTIONS(i):\n");
  printf(" Create animation.\n");
-  printf("   file_i +di+[xi+yi[+mi[bi]]]\n");
+  printf("   file_i +di+xi+yi+mi\n");
  printf("   where:    'file_i' is the i'th animation frame (WebP format),\n");
  printf("             'di' is the pause duration before next frame.\n");
  printf("             'xi','yi' specify the image offset for this frame.\n");
  printf("             'mi' is the dispose method for this frame (0 or 1).\n");
-  printf("             'bi' is the blending method for this frame (+b or -b)."
-         "\n");

  printf("\n");
  printf("LOOP_COUNT:\n");
@@ -425,33 +409,22 @@ static int WriteWebP(WebPMux* const mux, const char* filename) {

 static int ParseFrameArgs(const char* args, WebPMuxFrameInfo* const info) {
  int dispose_method, dummy;
-  char plus_minus, blend_method;
-  const int num_args = sscanf(args, "+%d+%d+%d+%d%c%c+%d", &info->duration,
-                              &info->x_offset, &info->y_offset, &dispose_method,
-                              &plus_minus, &blend_method, &dummy);
+  const int num_args = sscanf(args, "+%d+%d+%d+%d+%d",
+                              &info->duration, &info->x_offset, &info->y_offset,
+                              &dispose_method, &dummy);
  switch (num_args) {
    case 1:
      info->x_offset = info->y_offset = 0;  // fall through
    case 3:
      dispose_method = 0;  // fall through
    case 4:
-      plus_minus = '+';
-      blend_method = 'b';  // fall through
-    case 6:
      break;
-    case 2:
-    case 5:
    default:
      return 0;
  }
  // Note: The sanity of the following conversion is checked by
-  // WebPMuxPushFrame().
+  // WebPMuxSetAnimationParams().
  info->dispose_method = (WebPMuxAnimDispose)dispose_method;
-
-  if (blend_method != 'b') return 0;
-  if (plus_minus != '-' && plus_minus != '+') return 0;
-  info->blend_method =
-      (plus_minus == '+') ? WEBP_MUX_BLEND : WEBP_MUX_NO_BLEND;
  return 1;
 }

@@ -677,17 +650,6 @@ static int ParseCommandLine(int argc, const char* argv[],
               (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
        DeleteConfig(config);
        exit(0);
-      } else if (!strcmp(argv[i], "--")) {
-        if (i < argc - 1) {
-          ++i;
-          if (config->input_ == NULL) {
-            config->input_ = argv[i];
-          } else {
-            ERROR_GOTO2("ERROR at '%s': Multiple input files specified.\n",
-                        argv[i], ErrParse);
-          }
-        }
-        break;
      } else {
        ERROR_GOTO2("ERROR: Unknown option: '%s'.\n", argv[i], ErrParse);
      }
--- a/examples/wicdec.c
+++ b/examples/wicdec.c
@@ -308,7 +308,6 @@ int ReadPictureWithWIC(const char* const filename,
    int ok;
    pic->width = width;
    pic->height = height;
-    pic->use_argb = 1;
    ok = importer->import(pic, rgb, stride);
    if (!ok) hr = E_FAIL;
  }
--- a/examples/wicdec.h
+++ b/examples/wicdec.h
@@ -12,7 +12,7 @@
 #ifndef WEBP_EXAMPLES_WICDEC_H_
 #define WEBP_EXAMPLES_WICDEC_H_

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -27,7 +27,7 @@ int ReadPictureWithWIC(const char* const filename,
                       struct WebPPicture* const pic, int keep_alpha,
                       struct Metadata* const metadata);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/iosbuild.sh
+++ b/iosbuild.sh
@@ -15,9 +15,6 @@ set -e
 declare -r SDK=$(xcodebuild -showsdks \
  | grep iphoneos | sort | tail -n 1 | awk '{print substr($NF, 9)}'
 )
-# Extract Xcode version.
-declare -r XCODE=$(xcodebuild -version | grep Xcode | cut -d " " -f2)
-
 declare -r OLDPATH=${PATH}

 # Add iPhoneOS-V6 to the list of platforms below if you need armv6 support.
@@ -66,24 +63,12 @@ for PLATFORM in ${PLATFORMS}; do
  ROOTDIR="${BUILDDIR}/${PLATFORM}-${SDK}-${ARCH}"
  mkdir -p "${ROOTDIR}"

-  SDKROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/SDKs/${PLATFORM}${SDK}.sdk/"
-  CFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
-  LDFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
+  export DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer"
+  export SDKROOT="${DEVROOT}/SDKs/${PLATFORM}${SDK}.sdk"

-  if [[ -z "${XCODE}" ]]; then
-    echo "XCODE not available"
-    exit 1
-  elif [[ ${SDK} < 5.0.0 ]]; then
-    DEVROOT="${PLATFORMSROOT}/${PLATFORM}.platform/Developer/"
-  else
-    DEVROOT="${DEVELOPER}/Toolchains/XcodeDefault.xctoolchain"
-    CFLAGS+=" -miphoneos-version-min=5.0"
-    LDFLAGS+=" -miphoneos-version-min=5.0"
-  fi
-
-  export CFLAGS
-  export LDFLAGS
+  export CFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
  export CXXFLAGS=${CFLAGS}
+  export LDFLAGS="-arch ${ARCH} -pipe -isysroot ${SDKROOT}"
  export PATH="${DEVROOT}/usr/bin:${OLDPATH}"

  ${SRCDIR}/configure --host=${ARCH}-apple-darwin --prefix=${ROOTDIR} \
--- a/makefile.unix
+++ b/makefile.unix
@@ -141,22 +141,17 @@ EX_FORMAT_DEC_OBJS = \
 EX_UTIL_OBJS = \
    examples/example_util.o \

-GIF2WEBP_UTIL_OBJS = \
-    examples/gif2webp_util.o \
-
 MUX_OBJS = \
    src/mux/muxedit.o \
    src/mux/muxinternal.o \
    src/mux/muxread.o \

 UTILS_DEC_OBJS = \
-    src/utils/alpha_processing.o \
    src/utils/bit_reader.o \
    src/utils/color_cache.o \
    src/utils/filters.o \
    src/utils/huffman.o \
    src/utils/quant_levels_dec.o \
-    src/utils/random.o \
    src/utils/rescaler.o \
    src/utils/thread.o \
    src/utils/utils.o \
@@ -181,7 +176,6 @@ HDRS_INSTALLED = \
    src/webp/types.h \

 HDRS = \
-    src/dec/alphai.h \
    src/dec/decode_vp8.h \
    src/dec/vp8i.h \
    src/dec/vp8li.h \
@@ -191,7 +185,6 @@ HDRS = \
    src/dsp/yuv.h \
    src/enc/cost.h \
    src/enc/vp8enci.h \
-    src/utils/alpha_processing.h \
    src/utils/bit_reader.h \
    src/utils/bit_writer.h \
    src/utils/color_cache.h \
@@ -200,7 +193,6 @@ HDRS = \
    src/utils/huffman_encode.h \
    src/utils/quant_levels.h \
    src/utils/quant_levels_dec.h \
-    src/utils/random.h \
    src/utils/rescaler.h \
    src/utils/thread.h \
    src/webp/format_constants.h \
@@ -214,7 +206,6 @@ OUTPUT = $(OUT_LIBS) $(OUT_EXAMPLES)
 ifeq ($(MAKECMDGOALS),clean)
  OUTPUT += $(EXTRA_EXAMPLES)
  OUTPUT += src/demux/libwebpdemux.a src/mux/libwebpmux.a
-  OUTPUT += examples/libgif2webp_util.a
 endif

 ex: $(OUT_EXAMPLES)
@@ -226,7 +217,6 @@ $(EX_FORMAT_DEC_OBJS): %.o: %.h
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@

 examples/libexample_util.a: $(EX_UTIL_OBJS)
-examples/libgif2webp_util.a: $(GIF2WEBP_UTIL_OBJS)
 src/libwebpdecoder.a: $(LIBWEBPDECODER_OBJS)
 src/libwebp.a: $(LIBWEBP_OBJS)
 src/mux/libwebpmux.a: $(LIBWEBPMUX_OBJS)
@@ -245,14 +235,11 @@ examples/cwebp: src/libwebp.a
 examples/cwebp: EXTRA_LIBS += $(CWEBP_LIBS)
 examples/dwebp: examples/libexample_util.a src/libwebpdecoder.a
 examples/dwebp: EXTRA_LIBS += $(DWEBP_LIBS)
-examples/gif2webp: examples/libexample_util.a examples/libgif2webp_util.a
-examples/gif2webp: src/mux/libwebpmux.a src/libwebp.a
+examples/gif2webp: examples/libexample_util.a src/mux/libwebpmux.a src/libwebp.a
 examples/gif2webp: EXTRA_LIBS += $(GIF_LIBS)
-examples/gif2webp: EXTRA_FLAGS += -DWEBP_HAVE_GIF
 examples/vwebp: examples/libexample_util.a src/demux/libwebpdemux.a
 examples/vwebp: src/libwebp.a
 examples/vwebp: EXTRA_LIBS += $(GL_LIBS)
-examples/vwebp: EXTRA_FLAGS += -DWEBP_HAVE_GL
 examples/webpmux: examples/libexample_util.a src/mux/libwebpmux.a
 examples/webpmux: src/libwebpdecoder.a

@@ -263,8 +250,8 @@ dist: DESTDIR := dist
 dist: OUT_EXAMPLES += $(EXTRA_EXAMPLES)
 dist: all
 	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
-	           $(DESTDIR)/bin $(DESTDIR)/doc $(DESTDIR)/lib
-	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)/bin
+	           $(DESTDIR)/doc $(DESTDIR)/lib
+	$(INSTALL) -m755 -s $(OUT_EXAMPLES) $(DESTDIR)
 	$(INSTALL) -m644 $(HDRS_INSTALLED) $(DESTDIR)/include/webp
 	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
 	$(INSTALL) -m644 src/demux/libwebpdemux.a $(DESTDIR)/lib
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "December 12, 2013"
+.TH CWEBP 1 "March 13, 2013"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@@ -73,7 +73,7 @@ trade off between encoding speed and the compressed file size and quality.
 Possible values range from 0 to 6. Default value is 4.
 When higher values are used, the encoder will spend more time inspecting
 additional encoding possibilities and decide on the quality gain.
-Lower value can result in faster processing time at the expense of
+Lower value can result in faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
 .B \-jpeg_like
@@ -153,11 +153,6 @@ close as possible to this target.
 Set a maximum number of passes to use during the dichotomy used by
 options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
-.BI \-resize " width height
-Resize the source to a rectangle with size \fBwidth\fP x \fBheight\fP.
-If either (but not both) of the \fBwidth\fP or \fBheight\fP parameters is 0,
-the value will be calculated preserving the aspect-ratio.
-.TP
 .BI \-crop " x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
 (\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
@@ -173,9 +168,8 @@ Output additional ASCII-map of encoding information. Possible map values
 range from 1 to 6. This is only meant to help debugging.
 .TP
 .BI \-pre " int
-Specify some pre-processing steps. Using a value of '2' will trigger
-quality-dependent pseudo-random dithering during RGBA->YUVA conversion
-(lossy compression only).
+Specify a pre-processing filter. This option is a placeholder
+and has currently no effect.
 .TP
 .BI \-alpha_filter " string
 Specify the predictive filtering method for the alpha plane. One of 'none',
@@ -193,11 +187,6 @@ no compression, 1 uses WebP lossless format for compression. The default is 1.
 Modify unseen RGB values under fully transparent area, to help compressibility.
 The default is off.
 .TP
-.BI \-blend_alpha " int
-This option blends the alpha channel (if present) with the source using the
-background color specified in hexadecimal as 0xrrggbb. The alpha channel is
-afterward reset to the opaque value 255.
-.TP
 .B \-noalpha
 Using this option will discard the alpha channel.
 .TP
@@ -255,8 +244,6 @@ cwebp \-q 50 -lossless picture.png \-o picture_lossless.webp
 cwebp \-q 70 picture_with_alpha.png \-o picture_with_alpha.webp
 .br
 cwebp \-sns 70 \-f 50 \-size 60000 picture.png \-o picture.webp
-.br
-cwebp \-o picture.webp \-\- \-\-\-picture.png

 .SH AUTHORS
 \fBcwebp\fP was written by the WebP team.
@@ -268,7 +255,7 @@ for the Debian project (and may be used by others).

 .SH SEE ALSO
 .BR dwebp (1),
-.BR gif2webp (1)
+.BR gif2webp (1).
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/dwebp.1
+++ b/man/dwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "December 12, 2013"
+.TH DWEBP 1 "February 01, 2013"
 .SH NAME
 dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
@@ -23,13 +23,6 @@ Print the version number (as major.minor.revision) and exit.
 .TP
 .BI \-o " string
 Specify the name of the output file (as PNG format by default).
-Using "-" as output name will direct output to 'stdout'.
-.TP
-.B \-bmp
-Change the output format to uncompressed BMP.
-.TP
-.B \-tiff
-Change the output format to uncompressed TIFF.
 .TP
 .B \-pam
 Change the output format to PAM (retains alpha).
@@ -39,7 +32,7 @@ Change the output format to PPM (discards alpha).
 .TP
 .B \-pgm
 Change the output format to PGM. The output consists of luma/chroma
-samples instead of RGB, using the IMC4 layout. This option is mainly
+samples instead of RGB, using the IMC4 layout. This option is mainly
 for verification and debugging purposes.
 .TP
 .B \-yuv
@@ -55,15 +48,7 @@ edges (especially the red ones), but should be faster.
 .B \-nofilter
 Don't use the in-loop filtering process even if it is required by
 the bitstream. This may produce visible blocks on the non-compliant output,
-but it will make the decoding faster.
-.TP
-.B \-dither " strength
-Specify a dithering \fBstrength\fP between 0 and 100. Dithering is a
-post-processing effect applied to chroma components in lossy compression.
-It helps by smoothing gradients and avoiding banding artifacts.
-.TP
-.B \-nodither
-Disable all dithering (default).
+but will make the decoding faster.
 .TP
 .B \-mt
 Use multi-threading for decoding, if possible.
@@ -99,8 +84,6 @@ http://www.webmproject.org/code/contribute/submitting-patches/
 dwebp picture.webp \-o output.png
 .br
 dwebp picture.webp \-ppm \-o output.ppm
-.br
-dwebp \-o output.ppm \-\- \-\-\-picture.webp

 .SH AUTHORS
 \fBdwebp\fP was written by the WebP team.
@@ -112,8 +95,8 @@ for the Debian project (and may be used by others).

 .SH SEE ALSO
 .BR cwebp (1),
-.BR gif2webp (1),
-.BR webpmux (1)
+.BR webpmux (1),
+.BR gif2webp (1).
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/gif2webp.1
+++ b/man/gif2webp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH GIF2WEBP 1 "December 17, 2013"
+.TH GIF2WEBP 1 "February 01, 2013"
 .SH NAME
 gif2webp \- Convert a GIF image to WebP
 .SH SYNOPSIS
@@ -28,10 +28,6 @@ Print the version number (as major.minor.revision) and exit.
 .B \-lossy
 Encode the image using lossy compression.
 .TP
-.B \-mixed
-Mixed compression mode: optimize compression of the image by picking either
-lossy or lossless compression for each frame heuristically.
-.TP
 .BI \-q " float
 Specify the compression factor for RGB channels between 0 and 100. The default
 is 75.
@@ -53,43 +49,6 @@ additional encoding possibilities and decide on the quality gain.
 Lower value can result is faster processing time at the expense of
 larger file size and lower compression quality.
 .TP
-.BI \-kmin " int
-.TP
-.BI \-kmax " int
-Specify the minimum and maximum distance between consecutive key frames
-(independently decodable frames) in the output animation. The tool will insert
-some key frames into the output animation as needed so that this criteria is
-satisfied.
-.br
-A 'kmin' value of 0 will turn off insertion of key frames.
-Typical values are in the range 3 to 30. Default values are kmin = 9,
-kmax = 17 for lossless compression and kmin = 3, kmax = 5 for lossy compression.
-.br
-These two options are relevant only for animated images with large number of
-frames (>50).
-.br
-When lower values are used, more frames will be converted to key frames. This
-may lead to smaller number of frames required to decode a frame on average,
-thereby improving the decoding performance. But this may lead to slightly bigger
-file sizes.
-Higher values may lead to worse decoding performance, but smaller file sizes.
-.br
-Some restrictions:
-.br
-(i) kmin < kmax,
-.br
-(ii) kmin >= kmax / 2 + 1 and
-.br
-(iii) kmax - kmin <= 30.
-.br
-If any of these restrictions are not met, they will be enforced automatically.
-.TP
-.BI \-metadata " string
-A comma separated list of metadata to copy from the input to the output if
-present.
-Valid values: \fBall\fP, \fBnone\fP, \fBicc\fP, \fBxmp\fP.
-The default is \fBxmp\fP.
-.TP
 .BI \-f " int
 For lossy encoding only (specified by the \-lossy option). Specify the strength
 of the deblocking filter, between 0 (no filtering) and 100 (maximum filtering).
@@ -98,10 +57,6 @@ strength of the filtering process applied after decoding the picture. The higher
 the value the smoother the picture will appear. Typical values are usually in
 the range of 20 to 50.
 .TP
-.B \-mt
-Use multi-threading for encoding, if possible. This option is only effective
-when using lossy compression.
-.TP
 .B \-v
 Print extra information.
 .TP
@@ -123,8 +78,6 @@ gif2webp \-q 70 picture.gif \-o picture.webp
 gif2webp \-lossy \-m 3 picture.gif \-o picture_lossy.webp
 .br
 gif2webp \-lossy \-f 50 picture.gif \-o picture.webp
-.br
-gif2webp \-q 70 \-o picture.webp \-\- \-\-\-picture.gif

 .SH AUTHORS
 \fBgif2webp\fP was written by the WebP team.
@@ -135,9 +88,9 @@ This manual page was written by Urvang Joshi <urvang@google.com>, for the
 Debian project (and may be used by others).

 .SH SEE ALSO
-.BR cwebp (1),
 .BR dwebp (1),
-.BR webpmux (1)
+.BR cwebp (1),
+.BR webpmux (1).
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/man/webpmux.1
+++ b/man/webpmux.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH WEBPMUX 1 "December 17, 2013"
+.TH WEBPMUX 1 "March 16, 2013"
 .SH NAME
 webpmux \- command line tool to create WebP Mux/container file.
 .SH SYNOPSIS
@@ -92,14 +92,12 @@ Strip XMP metadata.

 .SS FRAME_OPTIONS (\-frame)
 .TP
-.I file_i +di[+xi+yi[+mi[bi]]]
+.I file_i +di[+xi+yi[+mi]]
 Where: 'file_i' is the i'th frame (WebP format), 'xi','yi' specify the image
-offset for this frame, 'di' is the pause duration before next frame, 'mi' is
-the dispose method for this frame (0 for NONE or 1 for BACKGROUND) and 'bi' is
-the blending method for this frame (+b for BLEND or -b for NO_BLEND).
-Argument 'bi' can be omitted and will default to +b (BLEND).
-Also, 'mi' can be omitted if 'bi' is omitted and will default to 0 (NONE).
-Finally, if 'mi' and 'bi' are omitted then 'xi' and 'yi' can be omitted and will
+offset for this frame, 'di' is the pause duration before next frame and 'mi' is
+the dispose method for this frame (0 for NONE or 1 for BACKGROUND).
+\&'mi' can be omitted and will default to 0 (NONE).
+Additionally, if 'mi' is omitted then 'xi' and 'yi' can be omitted and will
 default to +0+0.
 .TP
 .BI \-loop " n
@@ -151,22 +149,13 @@ webpmux \-get exif exif_container.webp \-o image_metadata.exif
 .br
 webpmux \-strip exif exif_container.webp \-o without_exif.webp
 .br
-webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50
+webpmux \-frame anim_1.webp +100 \-frame anim_2.webp +100+50+50 \-loop 10
 .br
 .RS 8
-\-frame anim_2.webp +100+50+50+1+b \-loop 10 \-bgcolor 255,255,255,255
-.br
-.RS 8
-\-o anim_container.webp
+\-bgcolor 255,255,255,255 \-o anim_container.webp
 .RE
 .br
 webpmux \-get frame 2 anim_container.webp \-o frame_2.webp
-.br
-webpmux \-set icc image_profile.icc \-o icc_container.webp \-\- \-\-\-in.webp
-.br
-webpmux \-get icc \-o image_profile.icc \-\- \-\-\-icc_container.webp
-.br
-webpmux \-strip icc \-o without_icc.webp \-\- \-\-\-icc_container.webp

 .SH AUTHORS
 \fBwebpmux\fP is written by the WebP team.
@@ -177,9 +166,9 @@ This manual page was written by Vikas Arora <vikaas.arora@gmail.com>,
 for the Debian project (and may be used by others).

 .SH SEE ALSO
-.BR cwebp (1),
 .BR dwebp (1),
-.BR gif2webp (1)
+.BR cwebp (1),
+.BR gif2webp (1).
 .br
 Please refer to http://developers.google.com/speed/webp/ for additional
 information.
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -36,7 +36,7 @@ libwebp_la_LIBADD += utils/libwebputils.la
 # other than the ones listed on the command line, i.e., after linking, it will
 # not have unresolved symbols. Some platforms (Windows among them) require all
 # symbols in shared libraries to be resolved at library creation.
-libwebp_la_LDFLAGS = -no-undefined -version-info 5:0:0
+libwebp_la_LDFLAGS = -no-undefined -version-info 4:3:0
 libwebpincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebp.pc

@@ -48,7 +48,7 @@ if BUILD_LIBWEBPDECODER
  libwebpdecoder_la_LIBADD += dsp/libwebpdspdecode.la
  libwebpdecoder_la_LIBADD += utils/libwebputilsdecode.la

-  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 1:0:0
+  libwebpdecoder_la_LDFLAGS = -no-undefined -version-info 0:1:0
  pkgconfig_DATA += libwebpdecoder.pc
 endif

--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@@ -3,7 +3,6 @@ noinst_LTLIBRARIES = libwebpdecode.la

 libwebpdecode_la_SOURCES =
 libwebpdecode_la_SOURCES += alpha.c
-libwebpdecode_la_SOURCES += alphai.h
 libwebpdecode_la_SOURCES += buffer.c
 libwebpdecode_la_SOURCES += decode_vp8.h
 libwebpdecode_la_SOURCES += frame.c
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@@ -12,150 +12,104 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
+#include "../utils/filters.h"
 #include "../utils/quant_levels_dec.h"
 #include "../webp/format_constants.h"

-//------------------------------------------------------------------------------
-// ALPHDecoder object.
-
-ALPHDecoder* ALPHNew(void) {
-  ALPHDecoder* const dec = (ALPHDecoder*)calloc(1, sizeof(*dec));
-  return dec;
-}
-
-void ALPHDelete(ALPHDecoder* const dec) {
-  if (dec != NULL) {
-    VP8LDelete(dec->vp8l_dec_);
-    dec->vp8l_dec_ = NULL;
-    free(dec);
-  }
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

 //------------------------------------------------------------------------------
-// Decoding.
+// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
+// The 'output' buffer should be pre-allocated and must be of the same
+// dimension 'height'x'width', as that of the image.
+//
+// Returns 1 on successfully decoding the compressed alpha and
+//         0 if either:
+//           error in bit-stream header (invalid compression mode or filter), or
+//           error returned by appropriate compression method.

-// Initialize alpha decoding by parsing the alpha header and decoding the image
-// header for alpha data stored using lossless compression.
-// Returns false in case of error in alpha header (data too short, invalid
-// compression method or filter, error in lossless header data etc).
-static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
-                    size_t data_size, int width, int height, uint8_t* output) {
+static int DecodeAlpha(const uint8_t* data, size_t data_size,
+                       int width, int height, uint8_t* output) {
+  WEBP_FILTER_TYPE filter;
+  int pre_processing;
+  int rsrv;
  int ok = 0;
+  int method;
  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
-  int rsrv;

  assert(width > 0 && height > 0);
  assert(data != NULL && output != NULL);

-  dec->width_ = width;
-  dec->height_ = height;
-
  if (data_size <= ALPHA_HEADER_LEN) {
    return 0;
  }

-  dec->method_ = (data[0] >> 0) & 0x03;
-  dec->filter_ = (data[0] >> 2) & 0x03;
-  dec->pre_processing_ = (data[0] >> 4) & 0x03;
+  method = (data[0] >> 0) & 0x03;
+  filter = (data[0] >> 2) & 0x03;
+  pre_processing = (data[0] >> 4) & 0x03;
  rsrv = (data[0] >> 6) & 0x03;
-  if (dec->method_ < ALPHA_NO_COMPRESSION ||
-      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
-      dec->filter_ >= WEBP_FILTER_LAST ||
-      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
+  if (method < ALPHA_NO_COMPRESSION ||
+      method > ALPHA_LOSSLESS_COMPRESSION ||
+      filter >= WEBP_FILTER_LAST ||
+      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
      rsrv != 0) {
    return 0;
  }

-  if (dec->method_ == ALPHA_NO_COMPRESSION) {
-    const size_t alpha_decoded_size = dec->width_ * dec->height_;
+  if (method == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = height * width;
    ok = (alpha_data_size >= alpha_decoded_size);
+    if (ok) memcpy(output, alpha_data, alpha_decoded_size);
  } else {
-    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
-    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
+    ok = VP8LDecodeAlphaImageStream(width, height, alpha_data, alpha_data_size,
+                                    output);
  }
+
+  if (ok) {
+    WebPUnfilterFunc unfilter_func = WebPUnfilters[filter];
+    if (unfilter_func != NULL) {
+      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
+      // and apply filter per image-row.
+      unfilter_func(width, height, width, output);
+    }
+    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
+      ok = DequantizeLevels(output, width, height);
+    }
+  }
+
  return ok;
 }

-// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
-// starting from row number 'row'. It assumes that rows up to (row - 1) have
-// already been decoded.
-// Returns false in case of bitstream error.
-static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
-  ALPHDecoder* const alph_dec = dec->alph_dec_;
-  const int width = alph_dec->width_;
-  const int height = alph_dec->height_;
-  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
-  uint8_t* const output = dec->alpha_plane_;
-  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
-    const size_t offset = row * width;
-    const size_t num_pixels = num_rows * width;
-    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
-    memcpy(dec->alpha_plane_ + offset,
-           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
-  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
-    assert(alph_dec->vp8l_dec_ != NULL);
-    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
-      return 0;
-    }
-  }
-
-  if (unfilter_func != NULL) {
-    unfilter_func(width, height, width, row, num_rows, output);
-  }
-
-  if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
-    if (!DequantizeLevels(output, width, height, row, num_rows)) {
-      return 0;
-    }
-  }
-
-  if (row + num_rows == dec->pic_hdr_.height_) {
-    dec->is_alpha_decoded_ = 1;
-  }
-  return 1;
-}
-
 //------------------------------------------------------------------------------
-// Main entry point.

 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows) {
  const int width = dec->pic_hdr_.width_;
  const int height = dec->pic_hdr_.height_;

-  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
+  if (row < 0 || num_rows < 0 || row + num_rows > height) {
    return NULL;    // sanity check.
  }

  if (row == 0) {
-    // Initialize decoding.
-    assert(dec->alpha_plane_ != NULL);
-    dec->alph_dec_ = ALPHNew();
-    if (dec->alph_dec_ == NULL) return NULL;
-    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
-                  width, height, dec->alpha_plane_)) {
-      ALPHDelete(dec->alph_dec_);
-      dec->alph_dec_ = NULL;
-      return NULL;
+    // Decode everything during the first call.
+    assert(!dec->is_alpha_decoded_);
+    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+                     width, height, dec->alpha_plane_)) {
+      return NULL;  // Error.
    }
-  }
-
-  if (!dec->is_alpha_decoded_) {
-    int ok = 0;
-    assert(dec->alph_dec_ != NULL);
-    ok = ALPHDecode(dec, row, num_rows);
-    if (!ok || dec->is_alpha_decoded_) {
-      ALPHDelete(dec->alph_dec_);
-      dec->alph_dec_ = NULL;
-    }
-    if (!ok) return NULL;  // Error.
+    dec->is_alpha_decoded_ = 1;
  }

  // Return a pointer to the current decoded row.
  return dec->alpha_plane_ + row * width;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/alphai.h
+++ b/src/dec/alphai.h
@@ -1,55 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Alpha decoder: internal header.
-//
-// Author: Urvang (urvang@google.com)
-
-#ifndef WEBP_DEC_ALPHAI_H_
-#define WEBP_DEC_ALPHAI_H_
-
-#include "./webpi.h"
-#include "../utils/filters.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct VP8LDecoder;  // Defined in dec/vp8li.h.
-
-typedef struct ALPHDecoder ALPHDecoder;
-struct ALPHDecoder {
-  int width_;
-  int height_;
-  int method_;
-  WEBP_FILTER_TYPE filter_;
-  int pre_processing_;
-  struct VP8LDecoder* vp8l_dec_;
-  VP8Io io_;
-  int use_8b_decode;  // Although alpha channel requires only 1 byte per
-                      // pixel, sometimes VP8LDecoder may need to allocate
-                      // 4 bytes per pixel internally during decode.
-};
-
-//------------------------------------------------------------------------------
-// internal functions. Not public.
-
-// Allocates a new alpha decoder instance.
-ALPHDecoder* ALPHNew(void);
-
-// Clears and deallocates an alpha decoder instance.
-void ALPHDelete(ALPHDecoder* const dec);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-}    // extern "C"
-#endif
-
-#endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@@ -17,6 +17,10 @@
 #include "./webpi.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // WebPDecBuffer

@@ -208,3 +212,6 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/decode_vp8.h
+++ b/src/dec/decode_vp8.h
@@ -16,7 +16,7 @@

 #include "../webp/decode.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -132,8 +132,7 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }

-// Decode the VP8 frame header. Returns true if ok.
-// Note: 'io->data' must be pointing to the start of the VP8 frame header.
+// Start decoding a new picture. Returns true if ok.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);

 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
@@ -178,7 +177,7 @@ WEBP_EXTERN(int) VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -15,10 +15,11 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"

-#define ALIGN_MASK (32 - 1)
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+#define ALIGN_MASK (32 - 1)

 //------------------------------------------------------------------------------
 // Filtering
@@ -30,18 +31,25 @@ static void ReconstructRow(const VP8Decoder* const dec,
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

+static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
+  if (keyframe) {
+    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+  } else {
+    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
+  }
+}
+
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int cache_id = ctx->id_;
  const int y_bps = dec->cache_y_stride_;
-  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
+  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  const int level = f_info->f_level_;
  const int ilevel = f_info->f_ilevel_;
-  const int limit = f_info->f_limit_;
-  if (limit == 0) {
+  const int limit = 2 * level + ilevel;
+  if (level == 0) {
    return;
  }
-  assert(limit >= 3);
  if (dec->filter_type_ == 1) {   // simple
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
@@ -57,9 +65,10 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
    }
  } else {    // complex
    const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
-    const int hev_thresh = f_info->hev_thresh_;
+    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh =
+        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
@@ -119,107 +128,25 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
          }
        }
        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-        if (level > 0) {
-          int ilevel = level;
-          if (hdr->sharpness_ > 0) {
-            if (hdr->sharpness_ > 4) {
-              ilevel >>= 2;
-            } else {
-              ilevel >>= 1;
-            }
-            if (ilevel > 9 - hdr->sharpness_) {
-              ilevel = 9 - hdr->sharpness_;
-            }
+        info->f_level_ = level;
+
+        if (hdr->sharpness_ > 0) {
+          if (hdr->sharpness_ > 4) {
+            level >>= 2;
+          } else {
+            level >>= 1;
+          }
+          if (level > 9 - hdr->sharpness_) {
+            level = 9 - hdr->sharpness_;
          }
-          if (ilevel < 1) ilevel = 1;
-          info->f_ilevel_ = ilevel;
-          info->f_limit_ = 2 * level + ilevel;
-          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
-        } else {
-          info->f_limit_ = 0;  // no filtering
        }
-        info->f_inner_ = i4x4;
+        info->f_ilevel_ = (level < 1) ? 1 : level;
+        info->f_inner_ = 0;
      }
    }
  }
 }

-//------------------------------------------------------------------------------
-// Dithering
-
-#define DITHER_AMP_TAB_SIZE 12
-static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
-  // roughly, it's dqm->uv_mat_[1]
-  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
-};
-
-void VP8InitDithering(const WebPDecoderOptions* const options,
-                      VP8Decoder* const dec) {
-  assert(dec != NULL);
-  if (options != NULL) {
-    const int d = options->dithering_strength;
-    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
-    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
-    if (f > 0) {
-      int s;
-      int all_amp = 0;
-      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8QuantMatrix* const dqm = &dec->dqm_[s];
-        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
-          // TODO(skal): should we specially dither more for uv_quant_ < 0?
-          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
-          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
-        }
-        all_amp |= dqm->dither_;
-      }
-      if (all_amp != 0) {
-        VP8InitRandom(&dec->dithering_rg_, 1.0f);
-        dec->dither_ = 1;
-      }
-    }
-  }
-}
-
-// minimal amp that will provide a non-zero dithering effect
-#define MIN_DITHER_AMP 4
-#define DITHER_DESCALE 4
-#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
-#define DITHER_AMP_BITS 8
-#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
-
-static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
-  int i, j;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) {
-      // TODO: could be made faster with SSE2
-      const int bits =
-          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
-      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
-      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
-      const int v = (int)dst[i] + delta;
-      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
-    }
-    dst += bps;
-  }
-}
-
-static void DitherRow(VP8Decoder* const dec) {
-  int mb_x;
-  assert(dec->dither_);
-  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
-    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-    const VP8MBData* const data = ctx->mb_data_ + mb_x;
-    const int cache_id = ctx->id_;
-    const int uv_bps = dec->cache_uv_stride_;
-    if (data->dither_ >= MIN_DITHER_AMP) {
-      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
-      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
-      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
-      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
-    }
-  }
-}
-
 //------------------------------------------------------------------------------
 // This function is called after a row of macroblocks is finished decoding.
 // It also takes into account the following restrictions:
@@ -237,35 +164,25 @@ static void DitherRow(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int cache_id = ctx->id_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int mb_y = ctx->mb_y_;
-  const int is_first_row = (mb_y == 0);
-  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
-
-  if (dec->mt_method_ == 2) {
-    ReconstructRow(dec, ctx);
-  }
+  const int first_row = (ctx->mb_y_ == 0);
+  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);

  if (ctx->filter_row_) {
    FilterRow(dec);
  }

-  if (dec->dither_) {
-    DitherRow(dec);
-  }
-
-  if (io->put != NULL) {
-    int y_start = MACROBLOCK_VPOS(mb_y);
-    int y_end = MACROBLOCK_VPOS(mb_y + 1);
-    if (!is_first_row) {
+  if (io->put) {
+    if (!first_row) {
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
@@ -276,7 +193,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
      io->v = dec->cache_v_ + uv_offset;
    }

-    if (!is_last_row) {
+    if (!last_row) {
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
@@ -284,8 +201,11 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
    io->a = NULL;
    if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
-      // good idea.
+      // TODO(skal): several things to correct here:
+      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // * we're actually decompressing the full plane only once. It should be
+      //   more obvious from signature.
+      // * we could free alpha_data_ right after this call, but we don't own it.
      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -317,8 +237,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
    }
  }
  // rotate top samples if needed
-  if (cache_id + 1 == dec->num_caches_) {
-    if (!is_last_row) {
+  if (ctx->id_ + 1 == dec->num_caches_) {
+    if (!last_row) {
      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@@ -335,14 +255,10 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  const int filter_row =
-      (dec->filter_type_ > 0) &&
-      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
-  if (dec->mt_method_ == 0) {
+  if (!dec->use_threads_) {
    // ctx->id_ and ctx->f_info_ are already set
    ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = filter_row;
-    ReconstructRow(dec, ctx);
+    ctx->filter_row_ = dec->filter_row_;
    ok = FinishRow(dec, io);
  } else {
    WebPWorker* const worker = &dec->worker_;
@@ -353,21 +269,13 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
      ctx->io_ = *io;
      ctx->id_ = dec->cache_id_;
      ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = filter_row;
-      if (dec->mt_method_ == 2) {  // swap macroblock data
-        VP8MBData* const tmp = ctx->mb_data_;
-        ctx->mb_data_ = dec->mb_data_;
-        dec->mb_data_ = tmp;
-      } else {
-        // perform reconstruction directly in main thread
-        ReconstructRow(dec, ctx);
-      }
-      if (filter_row) {            // swap filter info
+      ctx->filter_row_ = dec->filter_row_;
+      if (ctx->filter_row_) {    // just swap filter info
        VP8FInfo* const tmp = ctx->f_info_;
        ctx->f_info_ = dec->f_info_;
        dec->f_info_ = tmp;
      }
-      WebPWorkerLaunch(worker);    // (reconstruct)+filter in parallel
+      WebPWorkerLaunch(worker);
      if (++dec->cache_id_ == dec->num_caches_) {
        dec->cache_id_ = 0;
      }
@@ -381,8 +289,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {

 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  // Call setup() first. This may trigger additional decoding features on 'io'.
-  // Note: Afterward, we must call teardown() no matter what.
-  if (io->setup != NULL && !io->setup(io)) {
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status_;
  }
@@ -395,7 +303,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {

  // Define the area where we can skip in-loop filtering, in case of cropping.
  //
-  // 'Simple' filter reads two luma samples outside of the macroblock
+  // 'Simple' filter reads two luma samples outside of the macroblock
  // and filters one. It doesn't filter the chroma samples. Hence, we can
  // avoid doing the in-loop filtering before crop_top/crop_left position.
  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
@@ -436,11 +344,11 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {

 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
    ok = WebPWorkerSync(&dec->worker_);
  }

-  if (io->teardown != NULL) {
+  if (io->teardown) {
    io->teardown(io);
  }
  return ok;
@@ -476,7 +384,7 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
 // Initialize multi/single-thread worker
 static int InitThreadContext(VP8Decoder* const dec) {
  dec->cache_id_ = 0;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
    WebPWorker* const worker = &dec->worker_;
    if (!WebPWorkerReset(worker)) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
@@ -493,28 +401,6 @@ static int InitThreadContext(VP8Decoder* const dec) {
  return 1;
 }

-int VP8GetThreadMethod(const WebPDecoderOptions* const options,
-                       const WebPHeaderStructure* const headers,
-                       int width, int height) {
-  if (options == NULL || options->use_threads == 0) {
-    return 0;
-  }
-  (void)headers;
-  (void)width;
-  (void)height;
-  assert(!headers->is_lossless);
-#if defined(WEBP_USE_THREAD)
-  if (width < MIN_WIDTH_FOR_THREADS) return 0;
-  // TODO(skal): tune the heuristic further
-#if 0
-  if (height < 2 * width) return 2;
-#endif
-  return 2;
-#else   // !WEBP_USE_THREAD
-  return 0;
-#endif
-}
-
 #undef MT_CACHE_LINES
 #undef ST_CACHE_LINES

@@ -526,15 +412,14 @@ static int AllocateMemory(VP8Decoder* const dec) {
  const int mb_w = dec->mb_w_;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
+  const size_t top_size = (16 + 8 + 8) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  const size_t f_info_size =
      (dec->filter_type_ > 0) ?
-          mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
+          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
        : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t mb_data_size =
-      (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
+  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
@@ -543,7 +428,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
-                        + yuv_size + mb_data_size
+                        + yuv_size + coeffs_size
                        + cache_size + alpha_size + ALIGN_MASK;
  uint8_t* mem;

@@ -564,8 +449,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->intra_t_ = (uint8_t*)mem;
  mem += intra_pred_mode_size;

-  dec->yuv_t_ = (VP8TopSamples*)mem;
-  mem += top_size;
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;

  dec->mb_info_ = ((VP8MB*)mem) + 1;
  mem += mb_info_size;
@@ -574,7 +463,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += f_info_size;
  dec->thread_ctx_.id_ = 0;
  dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
    // secondary cache line. The deblocking process need to make use of the
    // filtering strength from previous macroblock row, while the new ones
    // are being decoded in parallel. We'll just swap the pointers.
@@ -586,12 +475,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;

-  dec->mb_data_ = (VP8MBData*)mem;
-  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
-  if (dec->mt_method_ == 2) {
-    dec->thread_ctx_.mb_data_ += mb_w;
-  }
-  mem += mb_data_size;
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;

  dec->cache_y_stride_ = 16 * mb_w;
  dec->cache_uv_stride_ = 8 * mb_w;
@@ -613,9 +498,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);

-  // note: left/top-info is initialized once for all.
+  // note: left-info is initialized once for all.
  memset(dec->mb_info_ - 1, 0, mb_info_size);
-  VP8InitScanline(dec);   // initialize left too.

  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@@ -652,167 +536,159 @@ static const int kScan[16] = {
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };

-static int CheckMode(int mb_x, int mb_y, int mode) {
+static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
  if (mode == B_DC_PRED) {
-    if (mb_x == 0) {
-      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    if (dec->mb_x_ == 0) {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
    } else {
-      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
    }
  }
  return mode;
 }

-static void Copy32b(uint8_t* dst, uint8_t* src) {
-  memcpy(dst, src, 4);
+static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
+  *(uint32_t*)dst = *(uint32_t*)src;
 }

-static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
-                                    uint8_t* const dst) {
-  switch (bits >> 30) {
-    case 3:
-      VP8Transform(src, dst, 0);
-      break;
-    case 2:
-      VP8TransformAC3(src, dst);
-      break;
-    case 1:
-      VP8TransformDC(src, dst);
-      break;
-    default:
-      break;
-  }
-}
-
-static void DoUVTransform(uint32_t bits, const int16_t* const src,
-                          uint8_t* const dst) {
-  if (bits & 0xff) {    // any non-zero coeff at all?
-    if (bits & 0xaa) {  // any non-zero AC coefficient?
-      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
-    } else {
-      VP8TransformDCUV(src, dst);
-    }
-  }
-}
-
-static void ReconstructRow(const VP8Decoder* const dec,
-                           const VP8ThreadContext* ctx) {
+void VP8ReconstructBlock(VP8Decoder* const dec) {
  int j;
-  int mb_x;
-  const int mb_y = ctx->mb_y_;
-  const int cache_id = ctx->id_;
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
-  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-    const VP8MBData* const block = ctx->mb_data_ + mb_x;

-    // Rotate in the left samples from previously decoded block. We move four
-    // pixels at a time for alignment reason, and because of in-loop filter.
-    if (mb_x > 0) {
-      for (j = -1; j < 16; ++j) {
-        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+  // Rotate in the left samples from previously decoded block. We move four
+  // pixels at a time for alignment reason, and because of in-loop filter.
+  if (dec->mb_x_ > 0) {
+    for (j = -1; j < 16; ++j) {
+      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+    }
+    for (j = -1; j < 8; ++j) {
+      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+    }
+  } else {
+    for (j = 0; j < 16; ++j) {
+      y_dst[j * BPS - 1] = 129;
+    }
+    for (j = 0; j < 8; ++j) {
+      u_dst[j * BPS - 1] = 129;
+      v_dst[j * BPS - 1] = 129;
+    }
+    // Init top-left sample on left column too
+    if (dec->mb_y_ > 0) {
+      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+    }
+  }
+  {
+    // bring top samples into the cache
+    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
+    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
+    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
+    const int16_t* coeffs = dec->coeffs_;
+    int n;
+
+    if (dec->mb_y_ > 0) {
+      memcpy(y_dst - BPS, top_y, 16);
+      memcpy(u_dst - BPS, top_u, 8);
+      memcpy(v_dst - BPS, top_v, 8);
+    } else if (dec->mb_x_ == 0) {
+      // we only need to do this init once at block (0,0).
+      // Afterward, it remains valid for the whole topmost row.
+      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+      memset(u_dst - BPS - 1, 127, 8 + 1);
+      memset(v_dst - BPS - 1, 127, 8 + 1);
+    }
+
+    // predict and add residuals
+
+    if (dec->is_i4x4_) {   // 4x4
+      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+      if (dec->mb_y_ > 0) {
+        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
+          top_right[0] = top_y[15] * 0x01010101u;
+        } else {
+          memcpy(top_right, top_y + 16, sizeof(*top_right));
+        }
      }
-      for (j = -1; j < 8; ++j) {
-        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+      // replicate the top-right pixels below
+      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+      // predict and add residues for all 4x4 blocks in turn.
+      for (n = 0; n < 16; n++) {
+        uint8_t* const dst = y_dst + kScan[n];
+        VP8PredLuma4[dec->imodes_[n]](dst);
+        if (dec->non_zero_ac_ & (1 << n)) {
+          VP8Transform(coeffs + n * 16, dst, 0);
+        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+          VP8TransformDC(coeffs + n * 16, dst);
+        }
      }
-    } else {
-      for (j = 0; j < 16; ++j) {
-        y_dst[j * BPS - 1] = 129;
-      }
-      for (j = 0; j < 8; ++j) {
-        u_dst[j * BPS - 1] = 129;
-        v_dst[j * BPS - 1] = 129;
-      }
-      // Init top-left sample on left column too
-      if (mb_y > 0) {
-        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+    } else {    // 16x16
+      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+      VP8PredLuma16[pred_func](y_dst);
+      if (dec->non_zero_) {
+        for (n = 0; n < 16; n++) {
+          uint8_t* const dst = y_dst + kScan[n];
+          if (dec->non_zero_ac_ & (1 << n)) {
+            VP8Transform(coeffs + n * 16, dst, 0);
+          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+            VP8TransformDC(coeffs + n * 16, dst);
+          }
+        }
      }
    }
    {
-      // bring top samples into the cache
-      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
-      const int16_t* const coeffs = block->coeffs_;
-      uint32_t bits = block->non_zero_y_;
-      int n;
+      // Chroma
+      const int pred_func = CheckMode(dec, dec->uvmode_);
+      VP8PredChroma8[pred_func](u_dst);
+      VP8PredChroma8[pred_func](v_dst);

-      if (mb_y > 0) {
-        memcpy(y_dst - BPS, top_yuv[0].y, 16);
-        memcpy(u_dst - BPS, top_yuv[0].u, 8);
-        memcpy(v_dst - BPS, top_yuv[0].v, 8);
-      } else if (mb_x == 0) {
-        // we only need to do this init once at block (0,0).
-        // Afterward, it remains valid for the whole topmost row.
-        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-        memset(u_dst - BPS - 1, 127, 8 + 1);
-        memset(v_dst - BPS - 1, 127, 8 + 1);
-      }
-
-      // predict and add residuals
-      if (block->is_i4x4_) {   // 4x4
-        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
-
-        if (mb_y > 0) {
-          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
-            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
-          } else {
-            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
-          }
-        }
-        // replicate the top-right pixels below
-        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-
-        // predict and add residuals for all 4x4 blocks in turn.
-        for (n = 0; n < 16; ++n, bits <<= 2) {
-          uint8_t* const dst = y_dst + kScan[n];
-          VP8PredLuma4[block->imodes_[n]](dst);
-          DoTransform(bits, coeffs + n * 16, dst);
-        }
-      } else {    // 16x16
-        const int pred_func = CheckMode(mb_x, mb_y,
-                                        block->imodes_[0]);
-        VP8PredLuma16[pred_func](y_dst);
-        if (bits != 0) {
-          for (n = 0; n < 16; ++n, bits <<= 2) {
-            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
-          }
+      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
+        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
+        if (dec->non_zero_ac_ & 0x0f0000) {
+          VP8TransformUV(u_coeffs, u_dst);
+        } else {
+          VP8TransformDCUV(u_coeffs, u_dst);
        }
      }
-      {
-        // Chroma
-        const uint32_t bits_uv = block->non_zero_uv_;
-        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
-        VP8PredChroma8[pred_func](u_dst);
-        VP8PredChroma8[pred_func](v_dst);
-        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
-        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      if (dec->non_zero_ & 0xf00000) {   // chroma-V
+        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
+        if (dec->non_zero_ac_ & 0xf00000) {
+          VP8TransformUV(v_coeffs, v_dst);
+        } else {
+          VP8TransformDCUV(v_coeffs, v_dst);
+        }
      }

      // stash away top samples for next block
-      if (mb_y < dec->mb_h_ - 1) {
-        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
-        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
-        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      if (dec->mb_y_ < dec->mb_h_ - 1) {
+        memcpy(top_y, y_dst + 15 * BPS, 16);
+        memcpy(top_u, u_dst +  7 * BPS,  8);
+        memcpy(top_v, v_dst +  7 * BPS,  8);
      }
    }
-    // Transfer reconstructed samples from yuv_b_ cache to final destination.
-    {
-      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
-      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
-      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
-      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
-      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
-      for (j = 0; j < 16; ++j) {
-        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
-      }
-      for (j = 0; j < 8; ++j) {
-        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
-      }
+  }
+  // Transfer reconstructed samples from yuv_b_ cache to final destination.
+  {
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
+    for (j = 0; j < 16; ++j) {
+      memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+    }
+    for (j = 0; j < 8; ++j) {
+      memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+      memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
    }
  }
 }

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -15,11 +15,14 @@
 #include <string.h>
 #include <stdlib.h>

-#include "./alphai.h"
 #include "./webpi.h"
 #include "./vp8i.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
 #define CHUNK_SIZE 4096
@@ -28,13 +31,11 @@
 //------------------------------------------------------------------------------
 // Data structures for memory and states

-// Decoding states. State normally flows as:
-// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
-// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
+// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
 // If there is any error the decoder goes into state ERROR.
 typedef enum {
-  STATE_WEBP_HEADER,  // All the data before that of the VP8/VP8L chunk.
-  STATE_VP8_HEADER,   // The VP8 Frame header (within the VP8 chunk).
+  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
+  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
  STATE_VP8_PARTS0,
  STATE_VP8_DATA,
  STATE_VP8L_HEADER,
@@ -101,7 +102,7 @@ static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
 // Check if we need to preserve the compressed alpha data, as it may not have
 // been decoded yet.
 static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
-  if (idec->state_ == STATE_WEBP_HEADER) {
+  if (idec->state_ == STATE_PRE_VP8) {
    // We haven't parsed the headers yet, so we don't know whether the image is
    // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
    return 0;
@@ -110,7 +111,7 @@ static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
    return 0;  // ALPH chunk is not present for lossless images.
  } else {
    const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
-    assert(dec != NULL);  // Must be true as idec->state_ != STATE_WEBP_HEADER.
+    assert(dec != NULL);  // Must be true as idec->state_ != STATE_PRE_VP8.
    return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
  }
 }
@@ -140,22 +141,7 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
      }
      assert(last_part >= 0);
      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
-      if (NeedCompressedAlpha(idec)) {
-        ALPHDecoder* const alph_dec = dec->alph_dec_;
-        dec->alpha_data_ += offset;
-        if (alph_dec != NULL) {
-          if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
-            VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
-            assert(alph_vp8l_dec != NULL);
-            assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
-            VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
-                                   dec->alpha_data_ + ALPHA_HEADER_LEN,
-                                   dec->alpha_data_size_ - ALPHA_HEADER_LEN);
-          } else {  // alph_dec->method_ == ALPHA_NO_COMPRESSION
-            // Nothing special to do in this case.
-          }
-        }
-      }
+      if (NeedCompressedAlpha(idec)) dec->alpha_data_ += offset;
    } else {    // Resize lossless bitreader
      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
@@ -282,7 +268,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
  if (idec->state_ == STATE_VP8_DATA) {
    VP8Io* const io = &idec->io_;
-    if (io->teardown != NULL) {
+    if (io->teardown) {
      io->teardown(io);
    }
  }
@@ -325,9 +311,15 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    idec->dec_ = dec;
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = (idec->params_.options != NULL) &&
+                        (idec->params_.options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
-    ChangeState(idec, STATE_VP8_HEADER, headers.offset);
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
  } else {
    VP8LDecoder* const dec = VP8LNew();
    if (dec == NULL) {
@@ -342,14 +334,13 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
  const size_t curr_size = MemDataSize(&idec->mem_);
-  int width, height;
  uint32_t bits;

  if (curr_size < VP8_FRAME_HEADER_SIZE) {
    // Not enough data bytes to extract VP8 Frame Header.
    return VP8_STATUS_SUSPENDED;
  }
-  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
  }

@@ -416,10 +407,7 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  if (dec->status_ != VP8_STATUS_OK) {
    return IDecError(idec, dec->status_);
  }
-  // This change must be done before calling VP8InitFrame()
-  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
-                                       io->width, io->height);
-  VP8InitDithering(params->options, dec);
+
  if (!CopyParts0Data(idec)) {
    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
  }
@@ -445,11 +433,16 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  VP8Io* const io = &idec->io_;

  assert(dec->ready_);
+
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+    if (dec->mb_x_ == 0) {
+      VP8InitScanline(dec);
+    }
+    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
      MBContext context;
      SaveContext(dec, token_br, &context);
+
      if (!VP8DecodeMB(dec, token_br)) {
        RestoreContext(&context, dec, token_br);
        // We shouldn't fail when MAX_MB data was available
@@ -458,18 +451,19 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
        }
        return VP8_STATUS_SUSPENDED;
      }
+      // Reconstruct and emit samples.
+      VP8ReconstructBlock(dec);
+
      // Release buffer only if there is only one partition
      if (dec->num_parts_ == 1) {
        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
    }
-    VP8InitScanline(dec);   // Prepare for next scanline
-
-    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return IDecError(idec, VP8_STATUS_USER_ABORT);
    }
+    dec->mb_x_ = 0;
  }
  // Synchronize the thread and check for errors.
  if (!VP8ExitCritical(dec, io)) {
@@ -481,8 +475,7 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  return VP8_STATUS_OK;
 }

-static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
-                                         VP8StatusCode status) {
+static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
    return VP8_STATUS_SUSPENDED;
  }
@@ -539,14 +532,14 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
  VP8StatusCode status = VP8_STATUS_SUSPENDED;

-  if (idec->state_ == STATE_WEBP_HEADER) {
+  if (idec->state_ == STATE_PRE_VP8) {
    status = DecodeWebPHeaders(idec);
  } else {
    if (idec->dec_ == NULL) {
      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
    }
  }
-  if (idec->state_ == STATE_VP8_HEADER) {
+  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
    status = DecodeVP8FrameHeader(idec);
  }
  if (idec->state_ == STATE_VP8_PARTS0) {
@@ -573,7 +566,7 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
    return NULL;
  }

-  idec->state_ = STATE_WEBP_HEADER;
+  idec->state_ = STATE_PRE_VP8;
  idec->chunk_size_ = 0;

  InitMemBuffer(&idec->mem_);
@@ -581,8 +574,7 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
  VP8InitIo(&idec->io_);

  WebPResetDecParams(&idec->params_);
-  idec->params_.output = (output_buffer != NULL) ? output_buffer
-                                                 : &idec->output_;
+  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.

  return idec;
@@ -616,11 +608,11 @@ void WebPIDelete(WebPIDecoder* idec) {
    if (!idec->is_lossless_) {
      if (idec->state_ == STATE_VP8_DATA) {
        // Synchronize the thread, clean-up and check for errors.
-        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
+        VP8ExitCritical(idec->dec_, &idec->io_);
      }
-      VP8Delete((VP8Decoder*)idec->dec_);
+      VP8Delete(idec->dec_);
    } else {
-      VP8LDelete((VP8LDecoder*)idec->dec_);
+      VP8LDelete(idec->dec_);
    }
  }
  ClearMemBuffer(&idec->mem_);
@@ -835,7 +827,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                    VP8IoSetupHook setup,
                    VP8IoTeardownHook teardown,
                    void* user_data) {
-  if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
+  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
    return 0;
  }

@@ -847,3 +839,6 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
  return 1;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -18,6 +18,10 @@
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions

@@ -115,7 +119,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {

  if (y == 0) {
    // First line is special cased. We mirror the u/v samples at boundary.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
  } else {
    // We can finish the left-over line from previous call.
    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
@@ -599,7 +603,7 @@ static int CustomPut(const VP8Io* io) {
    return 0;
  }
  num_lines_out = p->emit(io, p);
-  if (p->emit_alpha != NULL) {
+  if (p->emit_alpha) {
    p->emit_alpha(io, p);
  }
  p->last_y += num_lines_out;
@@ -626,3 +630,6 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@@ -16,6 +16,10 @@

 #include "./vp8i.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------

 int VP8DecodeLayer(VP8Decoder* const dec) {
@@ -28,3 +32,6 @@ int VP8DecodeLayer(VP8Decoder* const dec) {
  return 1;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@@ -13,6 +13,10 @@

 #include "./vp8i.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 static WEBP_INLINE int clip(int v, int M) {
  return v < 0 ? 0 : v > M ? M : v;
 }
@@ -100,11 +104,12 @@ void VP8ParseQuant(VP8Decoder* const dec) {

      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
-
-      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
    }
  }
 }

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@@ -15,6 +15,10 @@

 #define USE_GENERIC_TREE

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
@@ -29,12 +33,61 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif

+#ifndef ONLY_KEYFRAME_CODE
+
+// inter prediction modes
+enum {
+  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
+  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
+
+static const int8_t kYModesInter[8] = {
+  -DC_PRED, 1,
+    2, 3,
+      -V_PRED, -H_PRED,
+      -TM_PRED, -B_PRED
+};
+
+static const int8_t kMBSplit[6] = {
+  -3, 1,
+    -2, 2,
+      -0, -1
+};
+
+static const int8_t kMVRef[8] = {
+  -ZEROMV, 1,
+    -NEARESTMV, 2,
+      -NEARMV, 3,
+        -NEWMV, -SPLITMV
+};
+
+static const int8_t kMVRef4[6] = {
+  -LEFT4, 1,
+    -ABOVE4, 2,
+      -ZERO4, -NEW4
+};
+#endif
+
 //------------------------------------------------------------------------------
 // Default probabilities

+// Inter
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
+static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
+static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
+  { 162, 128, 225, 146, 172, 147, 214,  39,
+    156, 128, 129, 132,  75, 145, 178, 206,
+    239, 254, 254 },
+  { 164, 128, 204, 170, 119, 235, 140, 230,
+    228, 128, 130, 130,  74, 148, 180, 203,
+    236, 254, 254 }
+};
+#endif
+
 // Paragraph 13.5
 static const uint8_t
  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // generated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -275,25 +328,28 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {

 void VP8ResetProba(VP8Proba* const proba) {
  memset(proba->segments_, 255u, sizeof(proba->segments_));
-  // proba->bands_[][] is initialized later
+  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
+#ifndef ONLY_KEYFRAME_CODE
+  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
+  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
+  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
+#endif
 }

-void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
  uint8_t* const left = dec->intra_l_;
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-
-  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
-  if (!block->is_i4x4_) {
-    // Hardcoded 16x16 intra-mode decision tree.
+  // Hardcoded 16x16 intra-mode decision tree.
+  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!dec->is_i4x4_) {
    const int ymode =
        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    block->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(*top));
-    memset(left, ymode, 4 * sizeof(*left));
+    dec->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(top[0]));
+    memset(left, ymode, 4 * sizeof(left[0]));
  } else {
-    uint8_t* modes = block->imodes_;
+    uint8_t* modes = dec->imodes_;
    int y;
    for (y = 0; y < 4; ++y) {
      int ymode = left[y];
@@ -302,10 +358,10 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
        // Generic tree-parsing
-        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
-        while (i > 0) {
+        int i = 0;
+        do {
          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        }
+        } while (i > 0);
        ymode = -i;
 #else
        // Hardcoded tree parsing
@@ -320,16 +376,15 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
        top[x] = ymode;
+        *modes++ = ymode;
      }
-      memcpy(modes, top, 4 * sizeof(*top));
-      modes += 4;
      left[y] = ymode;
    }
  }
  // Hardcoded UVMode decision tree
-  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
-                 : !VP8GetBit(br, 114) ? V_PRED
-                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+               : !VP8GetBit(br, 114) ? V_PRED
+               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }

 //------------------------------------------------------------------------------
@@ -471,6 +526,17 @@ static const uint8_t
  }
 };

+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
+  { 237, 246, 253, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 250, 250,
+    252, 254, 254 },
+  { 231, 243, 245, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 251, 251,
+    254, 254, 254 }
+};
+#endif
+
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
@@ -479,9 +545,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
    for (b = 0; b < NUM_BANDS; ++b) {
      for (c = 0; c < NUM_CTX; ++c) {
        for (p = 0; p < NUM_PROBAS; ++p) {
-          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
-                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
-          proba->bands_[t][b].probas_[c][p] = v;
+          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
+            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
+          }
        }
      }
    }
@@ -490,5 +556,36 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  if (dec->use_skip_proba_) {
    dec->skip_p_ = VP8GetValue(br, 8);
  }
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->frm_hdr_.key_frame_) {
+    int i;
+    dec->intra_p_ = VP8GetValue(br, 8);
+    dec->last_p_ = VP8GetValue(br, 8);
+    dec->golden_p_ = VP8GetValue(br, 8);
+    if (VP8Get(br)) {   // update y-mode
+      for (i = 0; i < 4; ++i) {
+        proba->ymode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    if (VP8Get(br)) {   // update uv-mode
+      for (i = 0; i < 3; ++i) {
+        proba->uvmode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    // update MV
+    for (i = 0; i < 2; ++i) {
+      int k;
+      for (k = 0; k < NUM_MV_PROBAS; ++k) {
+        if (VP8GetBit(br, MVUpdateProba[i][k])) {
+          const int v = VP8GetValue(br, 7);
+          proba->mv_[i][k] = v ? v << 1 : 1;
+        }
+      }
+    }
+  }
+#endif
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -13,12 +13,15 @@

 #include <stdlib.h>

-#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 #include "../utils/bit_reader.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------

 int WebPGetDecoderVersion(void) {
@@ -120,9 +123,6 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
    if (((bits >> 5)) >= chunk_size) {  // partition_length
      return 0;         // inconsistent size information.
    }
-    if (w == 0 || h == 0) {
-      return 0;         // We don't support both width and height to be zero.
-    }

    if (width) {
      *width = w;
@@ -249,6 +249,7 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
  VP8StatusCode status;
+  WebPHeaderStructure headers;

  if (dec == NULL) {
    return 0;
@@ -258,8 +259,33 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "null VP8Io passed to VP8GetHeaders()");
  }
-  buf = io->data;
-  buf_size = io->data_size;
+
+  // Process Pre-VP8 chunks.
+  headers.data = io->data;
+  headers.data_size = io->data_size;
+  status = WebPParseHeaders(&headers);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "Incorrect/incomplete header.");
+  }
+  if (headers.is_lossless) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "Unexpected lossless format encountered.");
+  }
+
+  if (dec->alpha_data_ == NULL) {
+    assert(dec->alpha_data_size_ == 0);
+    // We have NOT set alpha data yet. Set it now.
+    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
+    // WebPParseHeaders() is called more than once, as in incremental decoding
+    // case.)
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+  }
+
+  // Process the VP8 frame header.
+  buf = headers.data + headers.offset;
+  buf_size = headers.data_size - headers.offset;
+  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "Truncated header.");
@@ -355,11 +381,38 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

  // Frame buffer marking
  if (!frm_hdr->key_frame_) {
+    // Paragraph 9.7
+#ifndef ONLY_KEYFRAME_CODE
+    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
+    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
+    if (!(dec->buffer_flags_ & 1)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
+    }
+    if (!(dec->buffer_flags_ & 2)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
+    }
+    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
+    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
+#else
    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                       "Not a key frame.");
+#endif
+  } else {
+    dec->buffer_flags_ = 0x003 | 0x100;
  }

-  VP8Get(br);   // ignore the value of update_proba_
+  // Paragraph 9.8
+#ifndef ONLY_KEYFRAME_CODE
+  dec->update_proba_ = VP8Get(br);
+  if (!dec->update_proba_) {    // save for later restore
+    dec->proba_saved_ = dec->proba_;
+  }
+  dec->buffer_flags_ &= 1 << 8;
+  dec->buffer_flags_ |=
+      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
+#else
+  VP8Get(br);   // just ignore the value of update_proba_
+#endif

  VP8ParseProba(br, dec);

@@ -408,6 +461,9 @@ static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };

+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
+typedef const uint8_t (*ProbaCtxArray)[NUM_PROBAS];
+
 // See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
  int v;
@@ -441,20 +497,19 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }

 // Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
+// (and 0 if there's no coeff at all)
+static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
                     int ctx, const quant_t dq, int n, int16_t* out) {
  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n].probas_[ctx];
+  const uint8_t* p = prob[n][ctx];
+  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
+    return 0;
+  }
  for (; n < 16; ++n) {
-    if (!VP8GetBit(br, p[0])) {
-      return n;  // previous coeff was last non-zero coeff
-    }
-    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
-      p = prob[kBands[++n]].probas_[0];
-      if (n == 16) return 16;
-    }
-    {        // non zero coeff
-      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
+    const ProbaCtxArray p_ctx = prob[kBands[n + 1]];
+    if (!VP8GetBit(br, p[1])) {
+      p = p_ctx[0];
+    } else {  // non zero coeff
      int v;
      if (!VP8GetBit(br, p[2])) {
        v = 1;
@@ -464,107 +519,115 @@ static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
        p = p_ctx[2];
      }
      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      if (n < 15 && !VP8GetBit(br, p[0])) {   // EOB
+        return n + 1;
+      }
    }
  }
  return 16;
 }

-static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
-  nz_coeffs <<= 2;
-  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
-  return nz_coeffs;
-}
+// Alias-safe way of converting 4bytes to 32bits.
+typedef union {
+  uint8_t  i8[4];
+  uint32_t i32;
+} PackedNz;

-static int ParseResiduals(VP8Decoder* const dec,
-                          VP8MB* const mb, VP8BitReader* const token_br) {
-  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
-  const VP8BandProbas* ac_proba;
-  const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-  int16_t* dst = block->coeffs_;
+// Table to unpack four bits into four bytes
+static const PackedNz kUnpackTab[16] = {
+  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
+  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
+  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
+  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
+
+// Macro to pack four LSB of four bytes into four bits.
+#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
+    defined(__BIG_ENDIAN__)
+#define PACK_CST 0x08040201U
+#else
+#define PACK_CST 0x01020408U
+#endif
+#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
+
+static void ParseResiduals(VP8Decoder* const dec,
+                           VP8MB* const mb, VP8BitReader* const token_br) {
+  int out_t_nz, out_l_nz, first;
+  ProbaArray ac_prob;
+  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
+  int16_t* dst = dec->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
-  uint8_t tnz, lnz;
-  uint32_t non_zero_y = 0;
-  uint32_t non_zero_uv = 0;
+  PackedNz nz_ac, nz_dc;
+  PackedNz tnz, lnz;
+  uint32_t non_zero_ac = 0;
+  uint32_t non_zero_dc = 0;
  int x, y, ch;
-  uint32_t out_t_nz, out_l_nz;
-  int first;

+  nz_dc.i32 = nz_ac.i32 = 0;
  memset(dst, 0, 384 * sizeof(*dst));
-  if (!block->is_i4x4_) {    // parse DC
+  if (!dec->is_i4x4_) {    // parse DC
    int16_t dc[16] = { 0 };
-    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
-    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
-    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
-    if (nz > 1) {   // more than just the DC -> perform the full transform
-      VP8TransformWHT(dc, dst);
-    } else {        // only DC is non-zero -> inlined simplified transform
-      int i;
-      const int dc0 = (dc[0] + 3) >> 3;
-      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
-    }
+    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
+    mb->dc_nz_ = left_mb->dc_nz_ =
+        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+                   ctx, q->y2_mat_, 0, dc) > 0);
    first = 1;
-    ac_proba = bands[0];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
+    VP8TransformWHT(dc, dst);
  } else {
    first = 0;
-    ac_proba = bands[3];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
  }

-  tnz = mb->nz_ & 0x0f;
-  lnz = left_mb->nz_ & 0x0f;
+  tnz = kUnpackTab[mb->nz_ & 0xf];
+  lnz = kUnpackTab[left_mb->nz_ & 0xf];
  for (y = 0; y < 4; ++y) {
-    int l = lnz & 1;
-    uint32_t nz_coeffs = 0;
+    int l = lnz.i8[y];
    for (x = 0; x < 4; ++x) {
-      const int ctx = l + (tnz & 1);
-      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
-      l = (nz > first);
-      tnz = (tnz >> 1) | (l << 7);
-      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+      const int ctx = l + tnz.i8[x];
+      const int nz = GetCoeffs(token_br, ac_prob, ctx,
+                               q->y1_mat_, first, dst);
+      tnz.i8[x] = l = (nz > 0);
+      nz_dc.i8[x] = (dst[0] != 0);
+      nz_ac.i8[x] = (nz > 1);
      dst += 16;
    }
-    tnz >>= 4;
-    lnz = (lnz >> 1) | (l << 7);
-    non_zero_y = (non_zero_y << 8) | nz_coeffs;
+    lnz.i8[y] = l;
+    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
+    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
  }
-  out_t_nz = tnz;
-  out_l_nz = lnz >> 4;
+  out_t_nz = PACK(tnz, 24);
+  out_l_nz = PACK(lnz, 24);

+  tnz = kUnpackTab[mb->nz_ >> 4];
+  lnz = kUnpackTab[left_mb->nz_ >> 4];
  for (ch = 0; ch < 4; ch += 2) {
-    uint32_t nz_coeffs = 0;
-    tnz = mb->nz_ >> (4 + ch);
-    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
-      int l = lnz & 1;
+      int l = lnz.i8[ch + y];
      for (x = 0; x < 2; ++x) {
-        const int ctx = l + (tnz & 1);
-        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
-        l = (nz > 0);
-        tnz = (tnz >> 1) | (l << 3);
-        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+        const int ctx = l + tnz.i8[ch + x];
+        const int nz =
+            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
+                      ctx, q->uv_mat_, 0, dst);
+        tnz.i8[ch + x] = l = (nz > 0);
+        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
+        nz_ac.i8[y * 2 + x] = (nz > 1);
        dst += 16;
      }
-      tnz >>= 2;
-      lnz = (lnz >> 1) | (l << 5);
+      lnz.i8[ch + y] = l;
    }
-    // Note: we don't really need the per-4x4 details for U/V blocks.
-    non_zero_uv |= nz_coeffs << (4 * ch);
-    out_t_nz |= (tnz << 4) << ch;
-    out_l_nz |= (lnz & 0xf0) << ch;
+    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
+    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
  }
+  out_t_nz |= PACK(tnz, 20);
+  out_l_nz |= PACK(lnz, 20);
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;

-  block->non_zero_y_ = non_zero_y;
-  block->non_zero_uv_ = non_zero_uv;
-
-  // We look at the mode-code of each block and check if some blocks have less
-  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
-  // empty blocks.
-  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
-
-  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
+  dec->non_zero_ac_ = non_zero_ac;
+  dec->non_zero_ = non_zero_ac | non_zero_dc;
+  mb->skip_ = !dec->non_zero_;
 }
+#undef PACK

 //------------------------------------------------------------------------------
 // Main loop
@@ -572,9 +635,7 @@ static int ParseResiduals(VP8Decoder* const dec,
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  VP8BitReader* const br = &dec->br_;
  VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
-  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
-  int skip;
+  VP8MB* const info = dec->mb_info_ + dec->mb_x_;

  // Note: we don't save segment map (yet), as we don't expect
  // to decode more than 1 keyframe.
@@ -584,64 +645,71 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
        VP8GetBit(br, dec->proba_.segments_[1]) :
        2 + VP8GetBit(br, dec->proba_.segments_[2]);
  }
-  skip = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;

  VP8ParseIntraMode(br, dec);
  if (br->eof_) {
    return 0;
  }

-  if (!skip) {
-    skip = ParseResiduals(dec, mb, token_br);
+  if (!info->skip_) {
+    ParseResiduals(dec, info, token_br);
  } else {
-    left->nz_ = mb->nz_ = 0;
-    if (!block->is_i4x4_) {
-      left->nz_dc_ = mb->nz_dc_ = 0;
+    left->nz_ = info->nz_ = 0;
+    if (!dec->is_i4x4_) {
+      left->dc_nz_ = info->dc_nz_ = 0;
    }
-    block->non_zero_y_ = 0;
-    block->non_zero_uv_ = 0;
+    dec->non_zero_ = 0;
+    dec->non_zero_ac_ = 0;
  }

  if (dec->filter_type_ > 0) {  // store filter info
    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
-    *finfo = dec->fstrengths_[dec->segment_][block->is_i4x4_];
-    finfo->f_inner_ |= !skip;
+    *finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
+    finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
  }

-  return !token_br->eof_;
+  return (!token_br->eof_);
 }

 void VP8InitScanline(VP8Decoder* const dec) {
  VP8MB* const left = dec->mb_info_ - 1;
  left->nz_ = 0;
-  left->nz_dc_ = 0;
+  left->dc_nz_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->mb_x_ = 0;
+  dec->filter_row_ =
+    (dec->filter_type_ > 0) &&
+    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
 }

 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
-    // Parse bitstream for this row.
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+    VP8InitScanline(dec);
+    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
+      // Reconstruct and emit samples.
+      VP8ReconstructBlock(dec);
    }
-    VP8InitScanline(dec);   // Prepare for next scanline
-
-    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
  }
-  if (dec->mt_method_ > 0) {
-    if (!WebPWorkerSync(&dec->worker_)) return 0;
+  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+    return 0;
  }

  // Finish
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->update_proba_) {
+    dec->proba_ = dec->proba_saved_;
+  }
+#endif
+
 #ifdef WEBP_EXPERIMENTAL_FEATURES
  if (dec->layer_data_size_ > 0) {
    if (!VP8DecodeLayer(dec)) {
@@ -697,12 +765,12 @@ void VP8Clear(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
-  if (dec->mt_method_ > 0) {
+  if (dec->use_threads_) {
    WebPWorkerEnd(&dec->worker_);
  }
-  ALPHDelete(dec->alph_dec_);
-  dec->alph_dec_ = NULL;
-  free(dec->mem_);
+  if (dec->mem_) {
+    free(dec->mem_);
+  }
  dec->mem_ = NULL;
  dec->mem_size_ = 0;
  memset(&dec->br_, 0, sizeof(dec->br_));
@@ -711,3 +779,6 @@ void VP8Clear(VP8Decoder* const dec) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -17,11 +17,10 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
-#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -30,8 +29,10 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 4
-#define DEC_REV_VERSION 0
+#define DEC_MIN_VERSION 3
+#define DEC_REV_VERSION 1
+
+#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -99,9 +100,6 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)

-// minimal width under which lossy multi-threading is always disabled
-#define MIN_WIDTH_FOR_THREADS 512
-
 //------------------------------------------------------------------------------
 // Headers

@@ -130,19 +128,15 @@ typedef struct {
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;

-
-// probas associated to one of the contexts
-typedef uint8_t VP8ProbaArray[NUM_PROBAS];
-
-typedef struct {   // all the probas associated to one band
-  VP8ProbaArray probas_[NUM_CTX];
-} VP8BandProbas;
-
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
+  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t ymode_[4], uvmode_[3];
+  uint8_t mv_[2][NUM_MV_PROBAS];
+#endif
 } VP8Proba;

 // Filter parameters
@@ -159,59 +153,32 @@ typedef struct {
 // Informations about the macroblocks.

 typedef struct {  // filter specs
-  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
-  uint8_t f_ilevel_;     // inner limit in [1..63]
-  uint8_t f_inner_;      // do inner filtering?
-  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
+  unsigned int f_level_:6;      // filter strength: 0..63
+  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  unsigned int f_inner_:1;      // do inner filtering?
 } VP8FInfo;

-typedef struct {  // Top/Left Contexts used for syntax-parsing
-  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
-  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
+typedef struct {  // used for syntax-parsing
+  unsigned int nz_:24;       // non-zero AC/DC coeffs (24bit)
+  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  unsigned int skip_:1;      // block type
 } VP8MB;

 // Dequantization matrices
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
  quant_t y1_mat_, y2_mat_, uv_mat_;
-
-  int uv_quant_;   // U/V quantizer value
-  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;

-// Data needed to reconstruct a macroblock
-typedef struct {
-  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
-  uint8_t is_i4x4_;       // true if intra4x4
-  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
-  uint8_t uvmode_;        // chroma prediction mode
-  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
-  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
-  //   code=0 -> no coefficient
-  //   code=1 -> only DC
-  //   code=2 -> first three coefficients are non-zero
-  //   code=3 -> more than three coefficients are non-zero
-  // This allows to call specialized transform functions.
-  uint32_t non_zero_y_;
-  uint32_t non_zero_uv_;
-  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
-} VP8MBData;
-
 // Persistent information needed by the parallel processing
 typedef struct {
-  int id_;              // cache row to process (in [0..2])
-  int mb_y_;            // macroblock position of the row
-  int filter_row_;      // true if row-filtering is needed
-  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
-  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
-  VP8Io io_;            // copy of the VP8Io to pass to put()
+  int id_;            // cache row to process (in [0..2])
+  int mb_y_;          // macroblock position of the row
+  int filter_row_;    // true if row-filtering is needed
+  VP8FInfo* f_info_;  // filter strengths
+  VP8Io io_;          // copy of the VP8Io to pass to put()
 } VP8ThreadContext;

-// Saved top samples, per macroblock. Fits into a cache-line.
-typedef struct {
-  uint8_t y[16], u[8], v[8];
-} VP8TopSamples;
-
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user

@@ -231,8 +198,7 @@ struct VP8Decoder {

  // Worker
  WebPWorker worker_;
-  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
-                       // 2=[parse][recon+filter]
+  int use_threads_;    // use multi-thread
  int cache_id_;       // current cache row
  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
  VP8ThreadContext thread_ctx_;  // Thread context
@@ -249,9 +215,12 @@ struct VP8Decoder {
  // per-partition boolean decoders.
  VP8BitReader parts_[MAX_NUM_PARTITIONS];

-  // Dithering strength, deduced from decoding options
-  int dither_;                // whether to use dithering or not
-  VP8Random dithering_rg_;    // random generator for dithering
+  // buffer refresh flags
+  //   bit 0: refresh Gold, bit 1: refresh Alt
+  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
+  //   bit 6: Gold sign bias, bit 7: Alt sign bias
+  //   bit 8: refresh last frame
+  uint32_t buffer_flags_;

  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@@ -260,19 +229,24 @@ struct VP8Decoder {
  VP8Proba proba_;
  int use_skip_proba_;
  uint8_t skip_p_;
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t intra_p_, last_p_, golden_p_;
+  VP8Proba proba_saved_;
+  int update_proba_;
+#endif

  // Boundary data cache and persistent buffers.
-  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
-  uint8_t  intra_l_[4];   // left intra modes values
+  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];  // left intra modes values
+  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each

-  uint8_t segment_;       // segment of the currently parsed block
-  VP8TopSamples* yuv_t_;  // top y/u/v samples
+  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;     // filter strength info
+  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
+  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4

-  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
-  VP8FInfo* f_info_;      // filter strength info
-  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
-
-  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
+  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
  uint8_t* cache_u_;
  uint8_t* cache_v_;
  int cache_y_stride_;
@@ -284,20 +258,29 @@ struct VP8Decoder {

  // Per macroblock non-persistent infos.
  int mb_x_, mb_y_;       // current position, in macroblock units
-  VP8MBData* mb_data_;    // parsed reconstruction data
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  uint8_t segment_;       // block's segment
+
+  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
+  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
+  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
+  // If the bit is set, the 4x4 block contains some non-zero coefficients.
+  uint32_t non_zero_;
+  uint32_t non_zero_ac_;

  // Filtering side-info
  int filter_type_;                          // 0=off, 1=simple, 2=complex
+  int filter_row_;                           // per-row flag
  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type

-  // Alpha
-  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
-  const uint8_t* alpha_data_;     // compressed alpha data (if present)
+  // extensions
+  const uint8_t* alpha_data_;   // compressed alpha data (if present)
  size_t alpha_data_size_;
  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
  uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.

-  // extensions
  int layer_colorspace_;
  const uint8_t* layer_data_;   // compressed layer data (if present)
  size_t layer_data_size_;
@@ -320,6 +303,8 @@ void VP8ParseQuant(VP8Decoder* const dec);

 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+// Predict a block and add residual
+void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@@ -328,15 +313,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Return the multi-threading method to use (0=off), depending
-// on options and bitstream size. Only for lossy decoding.
-int VP8GetThreadMethod(const WebPDecoderOptions* const options,
-                       const WebPHeaderStructure* const headers,
-                       int width, int height);
-// Initialize dithering post-process if needed.
-void VP8InitDithering(const WebPDecoderOptions* const options,
-                      VP8Decoder* const dec);
-// Process the last decoded row (filtering + output).
+// Process the last decoded row (filtering + output)
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
@@ -352,7 +329,7 @@ int VP8DecodeLayer(VP8Decoder* const dec);

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/dec/vp8l.c
+++ b/src/dec/vp8l.c
@@ -14,14 +14,16 @@

 #include <stdio.h>
 #include <stdlib.h>
-#include "./alphai.h"
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
 #include "../dsp/yuv.h"
-#include "../utils/alpha_processing.h"
 #include "../utils/huffman.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define NUM_ARGB_CACHE_ROWS          16

 static const int kCodeLengthLiterals = 16;
@@ -57,7 +59,7 @@ static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = {
 };

 #define CODE_TO_PLANE_CODES        120
-static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
+static const uint8_t code_to_plane_lut[CODE_TO_PLANE_CODES] = {
  0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
  0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
  0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
@@ -80,19 +82,20 @@ static int DecodeImageStream(int xsize, int ysize,
 //------------------------------------------------------------------------------

 int VP8LCheckSignature(const uint8_t* const data, size_t size) {
-  return (size >= VP8L_FRAME_HEADER_SIZE &&
-          data[0] == VP8L_MAGIC_BYTE &&
-          (data[4] >> 5) == 0);  // version
+  return (size >= 1) && (data[0] == VP8L_MAGIC_BYTE);
 }

 static int ReadImageInfo(VP8LBitReader* const br,
                         int* const width, int* const height,
                         int* const has_alpha) {
-  if (VP8LReadBits(br, 8) != VP8L_MAGIC_BYTE) return 0;
+  const uint8_t signature = VP8LReadBits(br, 8);
+  if (!VP8LCheckSignature(&signature, 1)) {
+    return 0;
+  }
  *width = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
  *height = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
  *has_alpha = VP8LReadBits(br, 1);
-  if (VP8LReadBits(br, VP8L_VERSION_BITS) != 0) return 0;
+  VP8LReadBits(br, VP8L_VERSION_BITS);  // Read/ignore the version number.
  return 1;
 }

@@ -100,8 +103,6 @@ int VP8LGetInfo(const uint8_t* data, size_t data_size,
                int* const width, int* const height, int* const has_alpha) {
  if (data == NULL || data_size < VP8L_FRAME_HEADER_SIZE) {
    return 0;         // not enough data
-  } else if (!VP8LCheckSignature(data, data_size)) {
-    return 0;         // bad signature
  } else {
    int w, h, a;
    VP8LBitReader br;
@@ -139,11 +140,11 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
  if (plane_code > CODE_TO_PLANE_CODES) {
    return plane_code - CODE_TO_PLANE_CODES;
  } else {
-    const int dist_code = kCodeToPlane[plane_code - 1];
+    const int dist_code = code_to_plane_lut[plane_code - 1];
    const int yoffset = dist_code >> 4;
    const int xoffset = 8 - (dist_code & 0xf);
    const int dist = yoffset * xsize + xoffset;
-    return (dist >= 1) ? dist : 1;  // dist<1 can happen if xsize is very small
+    return (dist >= 1) ? dist : 1;
  }
 }

@@ -154,27 +155,15 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
 static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
                                  VP8LBitReader* const br) {
  const HuffmanTreeNode* node = tree->root_;
+  int num_bits = 0;
  uint32_t bits = VP8LPrefetchBits(br);
-  int bitpos = br->bit_pos_;
-  // Check if we find the bit combination from the Huffman lookup table.
-  const int lut_ix = bits & (HUFF_LUT - 1);
-  const int lut_bits = tree->lut_bits_[lut_ix];
-  if (lut_bits <= HUFF_LUT_BITS) {
-    VP8LSetBitPos(br, bitpos + lut_bits);
-    return tree->lut_symbol_[lut_ix];
-  }
-  node += tree->lut_jump_[lut_ix];
-  bitpos += HUFF_LUT_BITS;
-  bits >>= HUFF_LUT_BITS;
-
-  // Decode the value from a binary tree.
  assert(node != NULL);
-  do {
+  while (!HuffmanTreeNodeIsLeaf(node)) {
    node = HuffmanTreeNextNode(node, bits & 1);
    bits >>= 1;
-    ++bitpos;
-  } while (HuffmanTreeNodeIsNotLeaf(node));
-  VP8LSetBitPos(br, bitpos);
+    ++num_bits;
+  }
+  VP8LDiscardBits(br, num_bits);
  return node->symbol_;
 }

@@ -415,13 +404,12 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                  int rgba_stride, uint8_t* const rgba) {
-  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
  const int dst_width = rescaler->dst_width;
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
    WebPRescalerExportRow(rescaler);
-    WebPMultARGBRow(src, dst_width, 1);
    VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
    ++num_lines_out;
  }
@@ -429,22 +417,18 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
 }

 // Emit scaled rows.
-static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
-                                uint8_t* in, int in_stride, int mb_h,
-                                uint8_t* const out, int out_stride) {
+static int EmitRescaledRows(const VP8LDecoder* const dec,
+                            const uint32_t* const data, int in_stride, int mb_h,
+                            uint8_t* const out, int out_stride) {
  const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
+  const uint8_t* const in = (const uint8_t*)data;
  int num_lines_in = 0;
  int num_lines_out = 0;
  while (num_lines_in < mb_h) {
-    uint8_t* const row_in = in + num_lines_in * in_stride;
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
    uint8_t* const row_out = out + num_lines_out * out_stride;
-    const int lines_left = mb_h - num_lines_in;
-    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
-    assert(needed_lines > 0 && needed_lines <= lines_left);
-    WebPMultARGBRows(row_in, in_stride,
-                     dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
-    num_lines_in += needed_lines;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
    num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
  }
  return num_lines_out;
@@ -452,10 +436,11 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,

 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
-                    const uint8_t* row_in, int in_stride,
+                    const uint32_t* const data, int in_stride,
                    int mb_w, int mb_h,
                    uint8_t* const out, int out_stride) {
  int lines = mb_h;
+  const uint8_t* row_in = (const uint8_t*)data;
  uint8_t* row_out = out;
  while (lines-- > 0) {
    VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out);
@@ -477,8 +462,7 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
    uint8_t* const y = buf->y + y_pos * buf->y_stride;
    for (i = 0; i < width; ++i) {
      const uint32_t p = src[i];
-      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
-                       YUV_HALF);
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff);
    }
  }

@@ -497,11 +481,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
      const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
      const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
      if (!(y_pos & 1)) {  // even lines: store values
-        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
      } else {             // odd lines: average with previous values
-        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
        // Approximated average-of-four. But it's an acceptable diff.
        u[i] = (u[i] + tmp_u + 1) >> 1;
        v[i] = (v[i] + tmp_v + 1) >> 1;
@@ -513,11 +497,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
      const int g = (v0 >>  6) & 0x3fc;
      const int b = (v0 <<  2) & 0x3fc;
      if (!(y_pos & 1)) {  // even lines
-        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        u[i] = VP8RGBToU(r, g, b);
+        v[i] = VP8RGBToV(r, g, b);
      } else {             // odd lines (note: we could just skip this)
-        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
-        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        const int tmp_u = VP8RGBToU(r, g, b);
+        const int tmp_v = VP8RGBToV(r, g, b);
        u[i] = (u[i] + tmp_u + 1) >> 1;
        v[i] = (v[i] + tmp_v + 1) >> 1;
      }
@@ -533,12 +517,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,

 static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
  WebPRescaler* const rescaler = dec->rescaler;
-  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const uint32_t* const src = (const uint32_t*)rescaler->dst;
  const int dst_width = rescaler->dst_width;
  int num_lines_out = 0;
  while (WebPRescalerHasPendingOutput(rescaler)) {
    WebPRescalerExportRow(rescaler);
-    WebPMultARGBRow(src, dst_width, 1);
    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
    ++y_pos;
    ++num_lines_out;
@@ -547,28 +530,28 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
 }

 static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
-                                uint8_t* in, int in_stride, int mb_h) {
+                                const uint32_t* const data,
+                                int in_stride, int mb_h) {
+  const uint8_t* const in = (const uint8_t*)data;
  int num_lines_in = 0;
  int y_pos = dec->last_out_row_;
  while (num_lines_in < mb_h) {
-    const int lines_left = mb_h - num_lines_in;
-    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
-    WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
-    WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
-    num_lines_in += needed_lines;
-    in += needed_lines * in_stride;
+    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
+                                       row_in, in_stride);
    y_pos += ExportYUVA(dec, y_pos);
  }
  return y_pos;
 }

 static int EmitRowsYUVA(const VP8LDecoder* const dec,
-                        const uint8_t* in, int in_stride,
+                        const uint32_t* const data, int in_stride,
                        int mb_w, int num_rows) {
  int y_pos = dec->last_out_row_;
+  const uint8_t* row_in = (const uint8_t*)data;
  while (num_rows-- > 0) {
-    ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output_);
-    in += in_stride;
+    ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_);
+    row_in += in_stride;
    ++y_pos;
  }
  return y_pos;
@@ -579,11 +562,11 @@ static int EmitRowsYUVA(const VP8LDecoder* const dec,

 // Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and
 // crop options. Also updates the input data pointer, so that it points to the
-// start of the cropped window. Note that pixels are in ARGB format even if
-// 'in_data' is uint8_t*.
+// start of the cropped window.
+// Note that 'pixel_stride' is in units of 'uint32_t' (and not 'bytes').
 // Returns true if the crop window is not empty.
 static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
-                         uint8_t** const in_data, int pixel_stride) {
+                         const uint32_t** const in_data, int pixel_stride) {
  assert(y_start < y_end);
  assert(io->crop_left < io->crop_right);
  if (y_end > io->crop_bottom) {
@@ -592,11 +575,11 @@ static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
  if (y_start < io->crop_top) {
    const int delta = io->crop_top - y_start;
    y_start = io->crop_top;
-    *in_data += delta * pixel_stride;
+    *in_data += pixel_stride * delta;
  }
  if (y_start >= y_end) return 0;  // Crop window is empty.

-  *in_data += io->crop_left * sizeof(uint32_t);
+  *in_data += io->crop_left;

  io->mb_y = y_start - io->crop_top;
  io->mb_w = io->crop_right - io->crop_left;
@@ -670,18 +653,18 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
  // Emit output.
  {
    VP8Io* const io = dec->io_;
-    uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
-    const int in_stride = io->width * sizeof(uint32_t);  // in unit of RGBA
-    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
+    const uint32_t* rows_data = dec->argb_cache_;
+    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) {
      // Nothing to output (this time).
    } else {
      const WebPDecBuffer* const output = dec->output_;
+      const int in_stride = io->width * sizeof(*rows_data);
      if (output->colorspace < MODE_YUV) {  // convert to RGBA
        const WebPRGBABuffer* const buf = &output->u.RGBA;
        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
        const int num_rows_out = io->use_scaling ?
-            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
-                                 rgba, buf->stride) :
+            EmitRescaledRows(dec, rows_data, in_stride, io->mb_h,
+                             rgba, buf->stride) :
            EmitRows(output->colorspace, rows_data, in_stride,
                     io->mb_w, io->mb_h, rgba, buf->stride);
        // Update 'last_out_row_'.
@@ -700,232 +683,134 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
  assert(dec->last_row_ <= dec->height_);
 }

-// Row-processing for the special case when alpha data contains only one
-// transform (color indexing), and trivial non-green literals.
-static int Is8bOptimizable(const VP8LMetadata* const hdr) {
-  int i;
-  if (hdr->color_cache_size_ > 0) return 0;
-  // When the Huffman tree contains only one symbol, we can skip the
-  // call to ReadSymbol() for red/blue/alpha channels.
-  for (i = 0; i < hdr->num_htree_groups_; ++i) {
-    const HuffmanTree* const htrees = hdr->htree_groups_[i].htrees_;
-    if (htrees[RED].num_nodes_ > 1) return 0;
-    if (htrees[BLUE].num_nodes_ > 1) return 0;
-    if (htrees[ALPHA].num_nodes_ > 1) return 0;
-  }
-  return 1;
+#define DECODE_DATA_FUNC(FUNC_NAME, TYPE, STORE_PIXEL)                         \
+static int FUNC_NAME(VP8LDecoder* const dec, TYPE* const data, int width,      \
+                     int height, ProcessRowsFunc process_func) {               \
+  int ok = 1;                                                                  \
+  int col = 0, row = 0;                                                        \
+  VP8LBitReader* const br = &dec->br_;                                         \
+  VP8LMetadata* const hdr = &dec->hdr_;                                        \
+  HTreeGroup* htree_group = hdr->htree_groups_;                                \
+  TYPE* src = data;                                                            \
+  TYPE* last_cached = data;                                                    \
+  TYPE* const src_end = data + width * height;                                 \
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;             \
+  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;       \
+  VP8LColorCache* const color_cache =                                          \
+      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;                \
+  const int mask = hdr->huffman_mask_;                                         \
+  assert(htree_group != NULL);                                                 \
+  while (!br->eos_ && src < src_end) {                                         \
+    int code;                                                                  \
+    /* Only update when changing tile. Note we could use this test:        */  \
+    /* if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed */  \
+    /* but that's actually slower and needs storing the previous col/row.  */  \
+    if ((col & mask) == 0) {                                                   \
+      htree_group = GetHtreeGroupForPos(hdr, col, row);                        \
+    }                                                                          \
+    VP8LFillBitWindow(br);                                                     \
+    code = ReadSymbol(&htree_group->htrees_[GREEN], br);                       \
+    if (code < NUM_LITERAL_CODES) {  /* Literal*/                              \
+      int red, green, blue, alpha;                                             \
+      red = ReadSymbol(&htree_group->htrees_[RED], br);                        \
+      green = code;                                                            \
+      VP8LFillBitWindow(br);                                                   \
+      blue = ReadSymbol(&htree_group->htrees_[BLUE], br);                      \
+      alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);                    \
+      *src = STORE_PIXEL(alpha, red, green, blue);                             \
+    AdvanceByOne:                                                              \
+      ++src;                                                                   \
+      ++col;                                                                   \
+      if (col >= width) {                                                      \
+        col = 0;                                                               \
+        ++row;                                                                 \
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {      \
+          process_func(dec, row);                                              \
+        }                                                                      \
+        if (color_cache != NULL) {                                             \
+          while (last_cached < src) {                                          \
+            VP8LColorCacheInsert(color_cache, *last_cached++);                 \
+          }                                                                    \
+        }                                                                      \
+      }                                                                        \
+    } else if (code < len_code_limit) {  /* Backward reference */              \
+      int dist_code, dist;                                                     \
+      const int length_sym = code - NUM_LITERAL_CODES;                         \
+      const int length = GetCopyLength(length_sym, br);                        \
+      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);     \
+      VP8LFillBitWindow(br);                                                   \
+      dist_code = GetCopyDistance(dist_symbol, br);                            \
+      dist = PlaneCodeToDistance(width, dist_code);                            \
+      if (src - data < dist || src_end - src < length) {                       \
+        ok = 0;                                                                \
+        goto End;                                                              \
+      }                                                                        \
+      {                                                                        \
+        int i;                                                                 \
+        for (i = 0; i < length; ++i) src[i] = src[i - dist];                   \
+        src += length;                                                         \
+      }                                                                        \
+      col += length;                                                           \
+      while (col >= width) {                                                   \
+        col -= width;                                                          \
+        ++row;                                                                 \
+        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {      \
+          process_func(dec, row);                                              \
+        }                                                                      \
+      }                                                                        \
+      if (src < src_end) {                                                     \
+        htree_group = GetHtreeGroupForPos(hdr, col, row);                      \
+        if (color_cache != NULL) {                                             \
+          while (last_cached < src) {                                          \
+            VP8LColorCacheInsert(color_cache, *last_cached++);                 \
+          }                                                                    \
+        }                                                                      \
+      }                                                                        \
+    } else if (code < color_cache_limit) {  /* Color cache */                  \
+      const int key = code - len_code_limit;                                   \
+      assert(color_cache != NULL);                                             \
+      while (last_cached < src) {                                              \
+        VP8LColorCacheInsert(color_cache, *last_cached++);                     \
+      }                                                                        \
+      *src = VP8LColorCacheLookup(color_cache, key);                           \
+      goto AdvanceByOne;                                                       \
+    } else {  /* Not reached */                                                \
+      ok = 0;                                                                  \
+      goto End;                                                                \
+    }                                                                          \
+    ok = !br->error_;                                                          \
+    if (!ok) goto End;                                                         \
+  }                                                                            \
+  /* Process the remaining rows corresponding to last row-block. */            \
+  if (process_func != NULL) process_func(dec, row);                            \
+End:                                                                           \
+  if (br->error_ || !ok || (br->eos_ && src < src_end)) {                      \
+    ok = 0;                                                                    \
+    dec->status_ =                                                             \
+        (!br->eos_) ? VP8_STATUS_BITSTREAM_ERROR : VP8_STATUS_SUSPENDED;       \
+  } else if (src == src_end) {                                                 \
+    dec->state_ = READ_DATA;                                                   \
+  }                                                                            \
+  return ok;                                                                   \
 }

-static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
-  const int num_rows = row - dec->last_row_;
-  const uint8_t* const in =
-      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;
-  if (num_rows > 0) {
-    ApplyInverseTransformsAlpha(dec, num_rows, in);
-  }
-  dec->last_row_ = dec->last_out_row_ = row;
+static WEBP_INLINE uint32_t GetARGBPixel(int alpha, int red, int green,
+                                         int blue) {
+  return (alpha << 24) | (red << 16) | (green << 8) | blue;
 }

-static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
-                           int width, int height, int last_row) {
-  int ok = 1;
-  int row = dec->last_pixel_ / width;
-  int col = dec->last_pixel_ % width;
-  VP8LBitReader* const br = &dec->br_;
-  VP8LMetadata* const hdr = &dec->hdr_;
-  const HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
-  int pos = dec->last_pixel_;         // current position
-  const int end = width * height;     // End of data
-  const int last = width * last_row;  // Last pixel to decode
-  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
-  const int mask = hdr->huffman_mask_;
-  assert(htree_group != NULL);
-  assert(last_row <= height);
-  assert(Is8bOptimizable(hdr));
-
-  while (!br->eos_ && pos < last) {
-    int code;
-    // Only update when changing tile.
-    if ((col & mask) == 0) {
-      htree_group = GetHtreeGroupForPos(hdr, col, row);
-    }
-    VP8LFillBitWindow(br);
-    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
-    if (code < NUM_LITERAL_CODES) {  // Literal
-      data[pos] = code;
-      ++pos;
-      ++col;
-      if (col >= width) {
-        col = 0;
-        ++row;
-        if (row % NUM_ARGB_CACHE_ROWS == 0) {
-          ExtractPalettedAlphaRows(dec, row);
-        }
-      }
-    } else if (code < len_code_limit) {  // Backward reference
-      int dist_code, dist;
-      const int length_sym = code - NUM_LITERAL_CODES;
-      const int length = GetCopyLength(length_sym, br);
-      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
-      VP8LFillBitWindow(br);
-      dist_code = GetCopyDistance(dist_symbol, br);
-      dist = PlaneCodeToDistance(width, dist_code);
-      if (pos >= dist && end - pos >= length) {
-        int i;
-        for (i = 0; i < length; ++i) data[pos + i] = data[pos + i - dist];
-      } else {
-        ok = 0;
-        goto End;
-      }
-      pos += length;
-      col += length;
-      while (col >= width) {
-        col -= width;
-        ++row;
-        if (row % NUM_ARGB_CACHE_ROWS == 0) {
-          ExtractPalettedAlphaRows(dec, row);
-        }
-      }
-      if (pos < last && (col & mask)) {
-        htree_group = GetHtreeGroupForPos(hdr, col, row);
-      }
-    } else {  // Not reached
-      ok = 0;
-      goto End;
-    }
-    ok = !br->error_;
-    if (!ok) goto End;
-  }
-  // Process the remaining rows corresponding to last row-block.
-  ExtractPalettedAlphaRows(dec, row);
-
- End:
-  if (br->error_ || !ok || (br->eos_ && pos < end)) {
-    ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = (int)pos;
-    if (pos == end) dec->state_ = READ_DATA;
-  }
-  return ok;
+static WEBP_INLINE uint8_t GetAlphaPixel(int alpha, int red, int green,
+                                         int blue) {
+  (void)alpha;
+  (void)red;
+  (void)blue;
+  return green;  // Alpha value is stored in green channel.
 }

-static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
-                           int width, int height, int last_row,
-                           ProcessRowsFunc process_func) {
-  int ok = 1;
-  int row = dec->last_pixel_ / width;
-  int col = dec->last_pixel_ % width;
-  VP8LBitReader* const br = &dec->br_;
-  VP8LMetadata* const hdr = &dec->hdr_;
-  HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
-  uint32_t* src = data + dec->last_pixel_;
-  uint32_t* last_cached = src;
-  uint32_t* const src_end = data + width * height;     // End of data
-  uint32_t* const src_last = data + width * last_row;  // Last pixel to decode
-  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
-  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;
-  VP8LColorCache* const color_cache =
-      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
-  const int mask = hdr->huffman_mask_;
-  assert(htree_group != NULL);
-  assert(src_last <= src_end);
+DECODE_DATA_FUNC(DecodeImageData, uint32_t, GetARGBPixel)
+DECODE_DATA_FUNC(DecodeAlphaData, uint8_t, GetAlphaPixel)

-  while (!br->eos_ && src < src_last) {
-    int code;
-    // Only update when changing tile. Note we could use this test:
-    // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed
-    // but that's actually slower and needs storing the previous col/row.
-    if ((col & mask) == 0) {
-      htree_group = GetHtreeGroupForPos(hdr, col, row);
-    }
-    VP8LFillBitWindow(br);
-    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
-    if (code < NUM_LITERAL_CODES) {  // Literal
-      int red, green, blue, alpha;
-      red = ReadSymbol(&htree_group->htrees_[RED], br);
-      green = code;
-      VP8LFillBitWindow(br);
-      blue = ReadSymbol(&htree_group->htrees_[BLUE], br);
-      alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);
-      *src = (alpha << 24) | (red << 16) | (green << 8) | blue;
-    AdvanceByOne:
-      ++src;
-      ++col;
-      if (col >= width) {
-        col = 0;
-        ++row;
-        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
-          process_func(dec, row);
-        }
-        if (color_cache != NULL) {
-          while (last_cached < src) {
-            VP8LColorCacheInsert(color_cache, *last_cached++);
-          }
-        }
-      }
-    } else if (code < len_code_limit) {  // Backward reference
-      int dist_code, dist;
-      const int length_sym = code - NUM_LITERAL_CODES;
-      const int length = GetCopyLength(length_sym, br);
-      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
-      VP8LFillBitWindow(br);
-      dist_code = GetCopyDistance(dist_symbol, br);
-      dist = PlaneCodeToDistance(width, dist_code);
-      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
-        ok = 0;
-        goto End;
-      } else {
-        int i;
-        for (i = 0; i < length; ++i) src[i] = src[i - dist];
-        src += length;
-      }
-      col += length;
-      while (col >= width) {
-        col -= width;
-        ++row;
-        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
-          process_func(dec, row);
-        }
-      }
-      if (src < src_last) {
-        if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
-        if (color_cache != NULL) {
-          while (last_cached < src) {
-            VP8LColorCacheInsert(color_cache, *last_cached++);
-          }
-        }
-      }
-    } else if (code < color_cache_limit) {  // Color cache
-      const int key = code - len_code_limit;
-      assert(color_cache != NULL);
-      while (last_cached < src) {
-        VP8LColorCacheInsert(color_cache, *last_cached++);
-      }
-      *src = VP8LColorCacheLookup(color_cache, key);
-      goto AdvanceByOne;
-    } else {  // Not reached
-      ok = 0;
-      goto End;
-    }
-    ok = !br->error_;
-    if (!ok) goto End;
-  }
-  // Process the remaining rows corresponding to last row-block.
-  if (process_func != NULL) process_func(dec, row);
-
- End:
-  if (br->error_ || !ok || (br->eos_ && src < src_end)) {
-    ok = 0;
-    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
-                            : VP8_STATUS_BITSTREAM_ERROR;
-  } else {
-    dec->last_pixel_ = (int)(src - data);
-    if (src == src_end) dec->state_ = READ_DATA;
-  }
-  return ok;
-}
+#undef DECODE_DATA_FUNC

 // -----------------------------------------------------------------------------
 // VP8LTransform
@@ -1040,9 +925,6 @@ VP8LDecoder* VP8LNew(void) {
  dec->status_ = VP8_STATUS_OK;
  dec->action_ = READ_DIM;
  dec->state_ = READ_DIM;
-
-  VP8LDspInit();  // Init critical function pointers.
-
  return dec;
 }

@@ -1148,8 +1030,7 @@ static int DecodeImageStream(int xsize, int ysize,
  }

  // Use the Huffman trees to decode the LZ77 encoded data.
-  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize,
-                       transform_ysize, NULL);
+  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL);
  ok = ok && !br->error_;

 End:
@@ -1171,7 +1052,6 @@ static int DecodeImageStream(int xsize, int ysize,
      assert(data == NULL);
      assert(is_level0);
    }
-    dec->last_pixel_ = 0;  // Reset for future DECODE_DATA_FUNC() calls.
    if (!is_level0) ClearMetadata(hdr);  // Clean up temporary data behind.
  }
  return ok;
@@ -1179,35 +1059,29 @@ static int DecodeImageStream(int xsize, int ysize,

 //------------------------------------------------------------------------------
 // Allocate internal buffers dec->pixels_ and dec->argb_cache_.
-static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
+static int AllocateInternalBuffers(VP8LDecoder* const dec, int final_width,
+                                   size_t bytes_per_pixel) {
+  const int argb_cache_needed = (bytes_per_pixel == sizeof(uint32_t));
  const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_;
  // Scratch buffer corresponding to top-prediction row for transforming the
  // first row in the row-blocks. Not needed for paletted alpha.
-  const uint64_t cache_top_pixels = (uint16_t)final_width;
+  const uint64_t cache_top_pixels =
+      argb_cache_needed ? (uint16_t)final_width : 0ULL;
  // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha.
-  const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
+  const uint64_t cache_pixels =
+      argb_cache_needed ? (uint64_t)final_width * NUM_ARGB_CACHE_ROWS : 0ULL;
  const uint64_t total_num_pixels =
      num_pixels + cache_top_pixels + cache_pixels;

  assert(dec->width_ <= final_width);
-  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
+  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, bytes_per_pixel);
  if (dec->pixels_ == NULL) {
    dec->argb_cache_ = NULL;    // for sanity check
    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
    return 0;
  }
-  dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
-  return 1;
-}
-
-static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
-  const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
-  dec->argb_cache_ = NULL;    // for sanity check
-  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
-  if (dec->pixels_ == NULL) {
-    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
-    return 0;
-  }
+  dec->argb_cache_ =
+      argb_cache_needed ? dec->pixels_ + num_pixels + cache_top_pixels : NULL;
  return 1;
 }

@@ -1233,73 +1107,64 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
  dec->last_row_ = dec->last_out_row_ = row;
 }

-int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size,
-                          uint8_t* const output) {
+// Row-processing for the special case when alpha data contains only one
+// transform: color indexing.
+static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
+  const int num_rows = row - dec->last_row_;
+  const uint8_t* const in =
+      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;
+  if (num_rows <= 0) return;  // Nothing to be done.
+  ApplyInverseTransformsAlpha(dec, num_rows, in);
+  dec->last_row_ = dec->last_out_row_ = row;
+}
+
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output) {
+  VP8Io io;
  int ok = 0;
-  VP8LDecoder* dec;
-  VP8Io* io;
-  assert(alph_dec != NULL);
-  alph_dec->vp8l_dec_ = VP8LNew();
-  if (alph_dec->vp8l_dec_ == NULL) return 0;
-  dec = alph_dec->vp8l_dec_;
+  VP8LDecoder* const dec = VP8LNew();
+  size_t bytes_per_pixel = sizeof(uint32_t);  // Default: BGRA mode.
+  if (dec == NULL) return 0;

-  dec->width_ = alph_dec->width_;
-  dec->height_ = alph_dec->height_;
-  dec->io_ = &alph_dec->io_;
-  io = dec->io_;
+  dec->width_ = width;
+  dec->height_ = height;
+  dec->io_ = &io;

-  VP8InitIo(io);
-  WebPInitCustomIo(NULL, io);  // Just a sanity Init. io won't be used.
-  io->opaque = output;
-  io->width = alph_dec->width_;
-  io->height = alph_dec->height_;
+  VP8InitIo(&io);
+  WebPInitCustomIo(NULL, &io);    // Just a sanity Init. io won't be used.
+  io.opaque = output;
+  io.width = width;
+  io.height = height;

  dec->status_ = VP8_STATUS_OK;
  VP8LInitBitReader(&dec->br_, data, data_size);

  dec->action_ = READ_HDR;
-  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
-    goto Err;
-  }
+  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Err;

  // Special case: if alpha data uses only the color indexing transform and
  // doesn't use color cache (a frequent case), we will use DecodeAlphaData()
  // method that only needs allocation of 1 byte per pixel (alpha channel).
  if (dec->next_transform_ == 1 &&
      dec->transforms_[0].type_ == COLOR_INDEXING_TRANSFORM &&
-      Is8bOptimizable(&dec->hdr_)) {
-    alph_dec->use_8b_decode = 1;
-    ok = AllocateInternalBuffers8b(dec);
-  } else {
-    // Allocate internal buffers (note that dec->width_ may have changed here).
-    alph_dec->use_8b_decode = 0;
-    ok = AllocateInternalBuffers32b(dec, alph_dec->width_);
+      dec->hdr_.color_cache_size_ == 0) {
+    bytes_per_pixel = sizeof(uint8_t);
  }

-  if (!ok) goto Err;
-
-  dec->action_ = READ_DATA;
-  return 1;
-
- Err:
-  VP8LDelete(alph_dec->vp8l_dec_);
-  alph_dec->vp8l_dec_ = NULL;
-  return 0;
-}
-
-int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
-  VP8LDecoder* const dec = alph_dec->vp8l_dec_;
-  assert(dec != NULL);
-  assert(dec->action_ == READ_DATA);
-  assert(last_row <= dec->height_);
+  // Allocate internal buffers (note that dec->width_ may have changed here).
+  if (!AllocateInternalBuffers(dec, width, bytes_per_pixel)) goto Err;

  // Decode (with special row processing).
-  return alph_dec->use_8b_decode ?
+  dec->action_ = READ_DATA;
+  ok = (bytes_per_pixel == sizeof(uint8_t)) ?
      DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
-                      last_row) :
+                      ExtractPalettedAlphaRows) :
      DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                      last_row, ExtractAlphaRows);
+                      ExtractAlphaRows);
+
+ Err:
+  VP8LDelete(dec);
+  return ok;
 }

 //------------------------------------------------------------------------------
@@ -1335,6 +1200,7 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
 }

 int VP8LDecodeImage(VP8LDecoder* const dec) {
+  const size_t bytes_per_pixel = sizeof(uint32_t);
  VP8Io* io = NULL;
  WebPDecParams* params = NULL;

@@ -1354,14 +1220,14 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
    goto Err;
  }

-  if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
+  if (!AllocateInternalBuffers(dec, io->width, bytes_per_pixel)) goto Err;

  if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;

  // Decode.
  dec->action_ = READ_DATA;
  if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                       dec->height_, ProcessRows)) {
+                       ProcessRows)) {
    goto Err;
  }

@@ -1378,3 +1244,6 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/vp8li.h
+++ b/src/dec/vp8li.h
@@ -22,7 +22,7 @@
 #include "../utils/huffman.h"
 #include "../webp/format_constants.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -57,8 +57,7 @@ typedef struct {
  HTreeGroup     *htree_groups_;
 } VP8LMetadata;

-typedef struct VP8LDecoder VP8LDecoder;
-struct VP8LDecoder {
+typedef struct {
  VP8StatusCode    status_;
  VP8LDecodeState  action_;
  VP8LDecodeState  state_;
@@ -75,9 +74,6 @@ struct VP8LDecoder {
  int              width_;
  int              height_;
  int              last_row_;      // last input row decoded so far.
-  int              last_pixel_;    // last pixel decoded so far. However, it may
-                                   // not be transformed, scaled and
-                                   // color-converted yet.
  int              last_out_row_;  // last row output so far.

  VP8LMetadata     hdr_;
@@ -89,27 +85,18 @@ struct VP8LDecoder {

  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
  WebPRescaler    *rescaler;         // Common rescaler for all channels.
-};
+} VP8LDecoder;

 //------------------------------------------------------------------------------
 // internal functions. Not public.

-struct ALPHDecoder;  // Defined in dec/alphai.h.
-
 // in vp8l.c

-// Decodes image header for alpha data stored using lossless compression.
-// Returns false in case of error.
-int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
-                          const uint8_t* const data, size_t data_size,
-                          uint8_t* const output);
-
-// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
-// already decoded in previous call(s), it will resume decoding from where it
-// was paused.
-// Returns false in case of bitstream error.
-int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
-                               int last_row);
+// Decodes a raw image stream (without header) and stores the alpha data
+// into *output, which must be of size width x height. Returns false in case
+// of error.
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output);

 // Allocates and initialize a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
@@ -130,7 +117,7 @@ void VP8LDelete(VP8LDecoder* const dec);

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/dec/webp.c
+++ b/src/dec/webp.c
@@ -18,6 +18,10 @@
 #include "./webpi.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // RIFF layout is:
 //   Offset  tag
@@ -281,17 +285,9 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                          int* const height,
                                          int* const has_alpha,
                                          int* const has_animation,
-                                          int* const format,
                                          WebPHeaderStructure* const headers) {
-  int canvas_width = 0;
-  int canvas_height = 0;
-  int image_width = 0;
-  int image_height = 0;
  int found_riff = 0;
  int found_vp8x = 0;
-  int animation_present = 0;
-  int fragments_present = 0;
-
  VP8StatusCode status;
  WebPHeaderStructure hdrs;

@@ -312,35 +308,23 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  // Skip over VP8X.
  {
    uint32_t flags = 0;
-    status = ParseVP8X(&data, &data_size, &found_vp8x,
-                       &canvas_width, &canvas_height, &flags);
+    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
    if (status != VP8_STATUS_OK) {
      return status;  // Wrong VP8X / insufficient data.
    }
-    animation_present = !!(flags & ANIMATION_FLAG);
-    fragments_present = !!(flags & FRAGMENTS_FLAG);
    if (!found_riff && found_vp8x) {
      // Note: This restriction may be removed in the future, if it becomes
      // necessary to send VP8X chunk to the decoder.
      return VP8_STATUS_BITSTREAM_ERROR;
    }
    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
-    if (has_animation != NULL) *has_animation = animation_present;
-    if (format != NULL) *format = 0;   // default = undefined
-
-    image_width = canvas_width;
-    image_height = canvas_height;
-    if (found_vp8x && (animation_present || fragments_present) &&
-        headers == NULL) {
-      status = VP8_STATUS_OK;
-      goto ReturnWidthHeight;  // Just return features from VP8X header.
+    if (has_animation != NULL) *has_animation = !!(flags & ANIMATION_FLAG);
+    if (found_vp8x && headers == NULL) {
+      return VP8_STATUS_OK;  // Return features from VP8X header.
    }
  }

-  if (data_size < TAG_SIZE) {
-    status = VP8_STATUS_NOT_ENOUGH_DATA;
-    goto ReturnWidthHeight;
-  }
+  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;

  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
  if ((found_riff && found_vp8x) ||
@@ -348,7 +332,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
    if (status != VP8_STATUS_OK) {
-      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
+      return status;  // Found an invalid chunk size / insufficient data.
    }
  }

@@ -356,41 +340,35 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
                          &hdrs.compressed_size, &hdrs.is_lossless);
  if (status != VP8_STATUS_OK) {
-    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
+    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
  }
  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }

-  if (format != NULL && !(animation_present || fragments_present)) {
-    *format = hdrs.is_lossless ? 2 : 1;
-  }
-
  if (!hdrs.is_lossless) {
    if (data_size < VP8_FRAME_HEADER_SIZE) {
-      status = VP8_STATUS_NOT_ENOUGH_DATA;
-      goto ReturnWidthHeight;
+      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
    // Validates raw VP8 data.
-    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
-                    &image_width, &image_height)) {
+    if (!VP8GetInfo(data, data_size,
+                    (uint32_t)hdrs.compressed_size, width, height)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  } else {
    if (data_size < VP8L_FRAME_HEADER_SIZE) {
-      status = VP8_STATUS_NOT_ENOUGH_DATA;
-      goto ReturnWidthHeight;
+      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
    // Validates raw VP8L data.
-    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
+    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  }
-  // Validates image size coherency.
-  if (found_vp8x) {
-    if (canvas_width != image_width || canvas_height != image_height) {
-      return VP8_STATUS_BITSTREAM_ERROR;
-    }
+
+  if (has_alpha != NULL) {
+    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    // to set this is by looking for alpha data (from an ALPH chunk).
+    *has_alpha |= (hdrs.alpha_data != NULL);
  }
  if (headers != NULL) {
    *headers = hdrs;
@@ -398,20 +376,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
    assert(headers->offset == headers->data_size - data_size);
  }
- ReturnWidthHeight:
-  if (status == VP8_STATUS_OK ||
-      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
-    if (has_alpha != NULL) {
-      // If the data did not contain a VP8X/VP8L chunk the only definitive way
-      // to set this is by looking for alpha data (from an ALPH chunk).
-      *has_alpha |= (hdrs.alpha_data != NULL);
-    }
-    if (width != NULL) *width = image_width;
-    if (height != NULL) *height = image_height;
-    return VP8_STATUS_OK;
-  } else {
-    return status;
-  }
+  return VP8_STATUS_OK;  // Return features from VP8 header.
 }

 VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
@@ -420,8 +385,7 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
  assert(headers != NULL);
  // fill out headers, ignore width/height/has_alpha.
  status = ParseHeadersInternal(headers->data, headers->data_size,
-                                NULL, NULL, NULL, &has_animation,
-                                NULL, headers);
+                                NULL, NULL, NULL, &has_animation, headers);
  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
    // TODO(jzern): full support of animation frames will require API additions.
    if (has_animation) {
@@ -435,7 +399,7 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
 // WebPDecParams

 void WebPResetDecParams(WebPDecParams* const params) {
-  if (params != NULL) {
+  if (params) {
    memset(params, 0, sizeof(*params));
  }
 }
@@ -468,6 +432,11 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = params->options && (params->options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;

@@ -479,10 +448,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                     params->output);
      if (status == VP8_STATUS_OK) {  // Decode
-        // This change must be done before calling VP8Decode()
-        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
-                                             io.width, io.height);
-        VP8InitDithering(params->options, dec);
        if (!VP8Decode(dec, &io)) {
          status = dec->status_;
        }
@@ -669,6 +634,7 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
  assert(features != NULL);
  memset(features, 0, sizeof(*features));
+  features->bitstream_version = 0;
 }

 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
@@ -682,7 +648,7 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
  return ParseHeadersInternal(data, data_size,
                              &features->width, &features->height,
                              &features->has_alpha, &features->has_animation,
-                              &features->format, NULL);
+                              NULL);
 }

 //------------------------------------------------------------------------------
@@ -820,3 +786,6 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@@ -14,7 +14,7 @@
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -109,7 +109,7 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/demux/Makefile.am
+++ b/src/demux/Makefile.am
@@ -10,6 +10,6 @@ libwebpdemuxinclude_HEADERS += ../webp/mux_types.h
 libwebpdemuxinclude_HEADERS += ../webp/types.h

 libwebpdemux_la_LIBADD = ../libwebp.la
-libwebpdemux_la_LDFLAGS = -no-undefined -version-info 1:0:0
+libwebpdemux_la_LDFLAGS = -no-undefined -version-info 0:1:0
 libwebpdemuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpdemux.pc
--- a/src/demux/demux.c
+++ b/src/demux/demux.c
@@ -23,9 +23,13 @@
 #include "../webp/demux.h"
 #include "../webp/format_constants.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 2
-#define DMUX_REV_VERSION 0
+#define DMUX_MIN_VERSION 1
+#define DMUX_REV_VERSION 1

 typedef struct {
  size_t start_;        // start location of the data
@@ -43,10 +47,8 @@ typedef struct {
 typedef struct Frame {
  int x_offset_, y_offset_;
  int width_, height_;
-  int has_alpha_;
  int duration_;
  WebPMuxAnimDispose dispose_method_;
-  WebPMuxAnimBlend blend_method_;
  int is_fragment_;  // this is a frame fragment (and not a full frame).
  int frame_num_;  // the referent frame number for use in assembling fragments.
  int complete_;   // img_components_ contains a full image.
@@ -71,7 +73,6 @@ struct WebPDemuxer {
  Frame* frames_;
  Frame** frames_tail_;
  Chunk* chunks_;  // non-image chunks
-  Chunk** chunks_tail_;
 };

 typedef enum {
@@ -176,9 +177,10 @@ static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
 // Secondary chunk parsing

 static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
-  *dmux->chunks_tail_ = chunk;
+  Chunk** c = &dmux->chunks_;
+  while (*c != NULL) c = &(*c)->next_;
+  *c = chunk;
  chunk->next_ = NULL;
-  dmux->chunks_tail_ = &chunk->next_;
 }

 // Add a frame to the end of the list, ensuring the last frame is complete.
@@ -194,13 +196,18 @@ static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
 }

 // Store image bearing chunks to 'frame'.
+// If 'has_vp8l_alpha' is not NULL, it will be set to true if the frame is a
+// lossless image with alpha.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
-                              MemBuffer* const mem, Frame* const frame) {
+                              MemBuffer* const mem, Frame* const frame,
+                              int* const has_vp8l_alpha) {
  int alpha_chunks = 0;
  int image_chunks = 0;
  int done = (MemDataSize(mem) < min_size);
  ParseStatus status = PARSE_OK;

+  if (has_vp8l_alpha != NULL) *has_vp8l_alpha = 0;  // Default.
+
  if (done) return PARSE_NEED_MORE_DATA;

  do {
@@ -222,7 +229,6 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
          ++alpha_chunks;
          frame->img_components_[1].offset_ = chunk_start_offset;
          frame->img_components_[1].size_ = chunk_size;
-          frame->has_alpha_ = 1;
          frame->frame_num_ = frame_num;
          Skip(mem, payload_available);
        } else {
@@ -252,7 +258,7 @@ static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
          frame->img_components_[0].size_ = chunk_size;
          frame->width_ = features.width;
          frame->height_ = features.height;
-          frame->has_alpha_ |= features.has_alpha;
+          if (has_vp8l_alpha != NULL) *has_vp8l_alpha = features.has_alpha;
          frame->frame_num_ = frame_num;
          frame->complete_ = (status == PARSE_OK);
          Skip(mem, payload_available);
@@ -297,10 +303,9 @@ static ParseStatus NewFrame(const MemBuffer* const mem,
 // 'frame_chunk_size' is the previously validated, padded chunk size.
 static ParseStatus ParseAnimationFrame(
    WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
  int added_frame = 0;
-  int bits;
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status =
@@ -312,10 +317,7 @@ static ParseStatus ParseAnimationFrame(
  frame->width_          = 1 + ReadLE24s(mem);
  frame->height_         = 1 + ReadLE24s(mem);
  frame->duration_       = ReadLE24s(mem);
-  bits = ReadByte(mem);
-  frame->dispose_method_ =
-      (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
-  frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
+  frame->dispose_method_ = (WebPMuxAnimDispose)(ReadByte(mem) & 1);
  if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
    free(frame);
    return PARSE_ERROR;
@@ -323,8 +325,9 @@ static ParseStatus ParseAnimationFrame(

  // Store a frame only if the animation flag is set there is some data for
  // this frame is available.
-  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
+  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame,
+                      NULL);
+  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
    added_frame = AddFrame(dmux, frame);
    if (added_frame) {
      ++dmux->num_frames_;
@@ -343,7 +346,7 @@ static ParseStatus ParseAnimationFrame(
 static ParseStatus ParseFragment(WebPDemuxer* const dmux,
                                 uint32_t fragment_chunk_size) {
  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
  int added_fragment = 0;
  MemBuffer* const mem = &dmux->mem_;
@@ -356,10 +359,10 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
  frame->x_offset_ = 2 * ReadLE24s(mem);
  frame->y_offset_ = 2 * ReadLE24s(mem);

-  // Store a fragment only if the 'fragments' flag is set and there is some
-  // data available.
-  status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
+  // Store a fragment only if the fragments flag is set and there is some data
+  // available for this fragment.
+  status = StoreFrame(frame_num, frgm_payload_size, mem, frame, NULL);
+  if (status != PARSE_ERROR && has_fragments && frame->frame_num_ > 0) {
    added_fragment = AddFrame(dmux, frame);
    if (!added_fragment) {
      status = PARSE_ERROR;
@@ -391,20 +394,20 @@ static int StoreChunk(WebPDemuxer* const dmux,
 // -----------------------------------------------------------------------------
 // Primary chunk parsing

-static ParseStatus ReadHeader(MemBuffer* const mem) {
+static int ReadHeader(MemBuffer* const mem) {
  const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
  uint32_t riff_size;

  // Basic file level validation.
-  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
+  if (MemDataSize(mem) < min_size) return 0;
  if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
      memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
-    return PARSE_ERROR;
+    return 0;
  }

  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
-  if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
-  if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (riff_size < CHUNK_HEADER_SIZE) return 0;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;

  // There's no point in reading past the end of the RIFF chunk
  mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
@@ -413,7 +416,7 @@ static ParseStatus ReadHeader(MemBuffer* const mem) {
  }

  Skip(mem, RIFF_HEADER_SIZE);
-  return PARSE_OK;
+  return 1;
 }

 static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
@@ -421,7 +424,7 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
  ParseStatus status;
-  int image_added = 0;
+  int has_vp8l_alpha = 0;  // Frame contains a lossless image with alpha.

  if (dmux->frames_ != NULL) return PARSE_ERROR;
  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
@@ -432,14 +435,14 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {

  // For the single image case we allow parsing of a partial frame, but we need
  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame,
+                      &has_vp8l_alpha);
  if (status != PARSE_ERROR) {
    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
    // Clear any alpha when the alpha flag is missing.
    if (!has_alpha && frame->img_components_[1].size_ > 0) {
      frame->img_components_[1].offset_ = 0;
      frame->img_components_[1].size_ = 0;
-      frame->has_alpha_ = 0;
    }

    // Use the frame width/height as the canvas values for non-vp8x files.
@@ -448,26 +451,47 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
      dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
      dmux->canvas_width_ = frame->width_;
      dmux->canvas_height_ = frame->height_;
-      dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
-    }
-    if (!AddFrame(dmux, frame)) {
-      status = PARSE_ERROR;  // last frame was left incomplete
-    } else {
-      image_added = 1;
-      dmux->num_frames_ = 1;
+      dmux->feature_flags_ |= has_vp8l_alpha ? ALPHA_FLAG : 0;
    }
+    AddFrame(dmux, frame);
+    dmux->num_frames_ = 1;
+  } else {
+    free(frame);
  }

-  if (!image_added) free(frame);
  return status;
 }

-static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
  MemBuffer* const mem = &dmux->mem_;
  int anim_chunks = 0;
+  uint32_t vp8x_size;
  ParseStatus status = PARSE_OK;

+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = ReadLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = ReadByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + ReadLE24s(mem);
+  dmux->canvas_height_ = 1 + ReadLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
  do {
    int store_chunk = 1;
    const size_t chunk_start_offset = mem->start_;
@@ -486,7 +510,7 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
      case MKFOURCC('V', 'P', '8', ' '):
      case MKFOURCC('V', 'P', '8', 'L'): {
        // check that this isn't an animation (all frames should be in an ANMF).
-        if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
+        if (anim_chunks > 0) return PARSE_ERROR;

        Rewind(mem, CHUNK_HEADER_SIZE);
        status = ParseSingleImage(dmux);
@@ -523,14 +547,14 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
        goto Skip;
      }
-      case MKFOURCC('E', 'X', 'I', 'F'): {
-        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
-        goto Skip;
-      }
      case MKFOURCC('X', 'M', 'P', ' '): {
        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
        goto Skip;
      }
+      case MKFOURCC('E', 'X', 'I', 'F'): {
+        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
+        goto Skip;
+      }
 Skip:
      default: {
        if (chunk_size_padded <= MemDataSize(mem)) {
@@ -559,37 +583,6 @@ static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
  return status;
 }

-static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
-  MemBuffer* const mem = &dmux->mem_;
-  uint32_t vp8x_size;
-
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  dmux->is_ext_format_ = 1;
-  Skip(mem, TAG_SIZE);  // VP8X
-  vp8x_size = ReadLE32(mem);
-  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
-  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
-  vp8x_size += vp8x_size & 1;
-  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
-  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
-
-  dmux->feature_flags_ = ReadByte(mem);
-  Skip(mem, 3);  // Reserved.
-  dmux->canvas_width_  = 1 + ReadLE24s(mem);
-  dmux->canvas_height_ = 1 + ReadLE24s(mem);
-  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
-    return PARSE_ERROR;  // image final dimension is too large
-  }
-  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
-  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
-
-  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  return ParseVP8XChunks(dmux);
-}
-
 // -----------------------------------------------------------------------------
 // Format validation

@@ -604,42 +597,18 @@ static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
  return 1;
 }

-// If 'exact' is true, check that the image resolution matches the canvas.
-// If 'exact' is false, check that the x/y offsets do not exceed the canvas.
-// TODO(jzern): this is insufficient in the fragmented image case if the
-// expectation is that the fragments completely cover the canvas.
-static int CheckFrameBounds(const Frame* const frame, int exact,
-                            int canvas_width, int canvas_height) {
-  if (exact) {
-    if (frame->x_offset_ != 0 || frame->y_offset_ != 0) {
-      return 0;
-    }
-    if (frame->width_ != canvas_width || frame->height_ != canvas_height) {
-      return 0;
-    }
-  } else {
-    if (frame->x_offset_ < 0 || frame->y_offset_ < 0) return 0;
-    if (frame->width_ + frame->x_offset_ > canvas_width) return 0;
-    if (frame->height_ + frame->y_offset_ > canvas_height) return 0;
-  }
-  return 1;
-}
-
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
-  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
-  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const Frame* f = dmux->frames_;
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const Frame* f;

  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;

  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
  if (dmux->loop_count_ < 0) return 0;
  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
-#ifndef WEBP_EXPERIMENTAL_FEATURES
-  if (is_fragmented) return 0;
-#endif

-  while (f != NULL) {
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
    const int cur_frame_set = f->frame_num_;
    int frame_count = 0, fragment_count = 0;

@@ -649,10 +618,9 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
      const ChunkData* const image = f->img_components_;
      const ChunkData* const alpha = f->img_components_ + 1;

-      if (is_fragmented && !f->is_fragment_) return 0;
-      if (!is_fragmented && f->is_fragment_) return 0;
-      if (!is_animation && f->frame_num_ > 1) return 0;
-
+      if (!has_fragments && f->is_fragment_) return 0;
+      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
      if (f->complete_) {
        if (alpha->size_ == 0 && image->size_ == 0) return 0;
        // Ensure alpha precedes image bitstream.
@@ -674,17 +642,12 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
        if (f->next_ != NULL) return 0;
      }

-      if (f->width_ > 0 && f->height_ > 0 &&
-          !CheckFrameBounds(f, !(is_animation || is_fragmented),
-                            dmux->canvas_width_, dmux->canvas_height_)) {
-        return 0;
-      }
-
      fragment_count += f->is_fragment_;
      ++frame_count;
    }
-    if (!is_fragmented && frame_count > 1) return 0;
+    if (!has_fragments && frame_count > 1) return 0;
    if (fragment_count > 0 && frame_count != fragment_count) return 0;
+    if (f == NULL) break;
  }
  return 1;
 }
@@ -699,7 +662,6 @@ static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
  dmux->canvas_width_ = -1;
  dmux->canvas_height_ = -1;
  dmux->frames_tail_ = &dmux->frames_;
-  dmux->chunks_tail_ = &dmux->chunks_;
  dmux->mem_ = *mem;
 }

@@ -711,20 +673,11 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  MemBuffer mem;
  WebPDemuxer* dmux;

-  if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
-
  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;

  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
-  status = ReadHeader(&mem);
-  if (status != PARSE_OK) {
-    if (state != NULL) {
-      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
-                                                : WEBP_DEMUX_PARSE_ERROR;
-    }
-    return NULL;
-  }
+  if (!ReadHeader(&mem)) return NULL;

  partial = (mem.buf_size_ < mem.riff_end_);
  if (!allow_partial && partial) return NULL;
@@ -733,18 +686,16 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
  if (dmux == NULL) return NULL;
  InitDemux(dmux, &mem);

-  status = PARSE_ERROR;
  for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
    if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
      status = parser->parse(dmux);
      if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
      if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
-      if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
      break;
    }
  }
-  if (state != NULL) *state = dmux->state_;
+  if (state) *state = dmux->state_;

  if (status == PARSE_ERROR) {
    WebPDemuxDelete(dmux);
@@ -861,10 +812,8 @@ static int SynthesizeFrame(const WebPDemuxer* const dmux,
  iter->y_offset       = fragment->y_offset_;
  iter->width          = fragment->width_;
  iter->height         = fragment->height_;
-  iter->has_alpha      = fragment->has_alpha_;
  iter->duration       = fragment->duration_;
  iter->dispose_method = fragment->dispose_method_;
-  iter->blend_method   = fragment->blend_method_;
  iter->complete       = fragment->complete_;
  iter->fragment.bytes = payload;
  iter->fragment.size  = payload_size;
@@ -1000,3 +949,6 @@ void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
  (void)iter;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -17,6 +17,10 @@
 #include <cpu-features.h>
 #endif

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
@@ -78,3 +82,6 @@ VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -14,6 +14,10 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // run-time tables (~4k)

@@ -57,14 +61,6 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))

-#define STORE2(y, dc, d, c) do {    \
-  const int DC = (dc);              \
-  STORE(0, y, DC + (d));            \
-  STORE(1, y, DC + (c));            \
-  STORE(2, y, DC - (c));            \
-  STORE(3, y, DC - (d));            \
-} while (0)
-
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@@ -107,21 +103,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
    dst += BPS;
  }
 }
-
-// Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
-  const int a = in[0] + 4;
-  const int c4 = MUL(in[4], kC2);
-  const int d4 = MUL(in[4], kC1);
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
-  STORE2(0, a + d4, d1, c1);
-  STORE2(1, a + c4, d1, c1);
-  STORE2(2, a - c4, d1, c1);
-  STORE2(3, a - d4, d1, c1);
-}
 #undef MUL
-#undef STORE2

 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
@@ -697,7 +679,6 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------

 VP8DecIdct2 VP8Transform;
-VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@@ -725,7 +706,6 @@ void VP8DspInit(void) {
  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
  VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;

  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
@@ -754,3 +734,6 @@ void VP8DspInit(void) {
  }
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -14,11 +14,15 @@

 #include "./dsp.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if defined(WEBP_USE_NEON)

 #include "../dec/vp8i.h"

-#define QRegs "q0", "q1", "q2", "q3",                                          \
+#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

 #define FLIP_SIGN_BIT2(a, b, s)                                                \
@@ -97,9 +101,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
-    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
+    "vld1.u8    {q4}, [%[p]]                   \n"  // q1

-    DO_FILTER2(q1, q2, q3, q12, %[thresh])
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])

    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

@@ -118,18 +122,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
-    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
-    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
-    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
-    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
+    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
+    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
+    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4

-    DO_FILTER2(q1, q2, q12, q13, %[thresh])
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

-    "vswp        d5, d24                       \n"
+    "vswp        d5, d6                        \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
-    STORE8x2(d24, d25, [%[p]], %[stride])
+    STORE8x2(d6, d7, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
@@ -156,7 +160,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
 //-----------------------------------------------------------------------------
 // Inverse transforms (Paragraph 14.4)

-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
  const int kBPS = BPS;
  const int16_t constants[] = {20091, 17734, 0, 0};
  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -305,44 +309,13 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
  );
 }

-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOneNEON(in, dst);
  if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOneNEON(in + 16, dst + 4);
  }
 }

-static void TransformDC(const int16_t* in, uint8_t* dst) {
-  const int DC = (in[0] + 4) >> 3;
-  const int kBPS = BPS;
-  __asm__ volatile (
-    "vdup.16         q1, %[DC]        \n"
-
-    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
-    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"
-
-    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
-
-    // add DC and convert to s16.
-    "vaddw.u8        q2, q1, d0                  \n"
-    "vaddw.u8        q3, q1, d1                  \n"
-    // convert back to u8 with saturation
-    "vqmovun.s16     d0,  q2                     \n"
-    "vqmovun.s16     d1,  q3                     \n"
-
-    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
-    "vst1.32         d1[1], [%[dst]]             \n"
-    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
-    : [kBPS] "r"(kBPS),   /* constants */
-      [DC] "r"(DC)
-    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
-  );
-}
-
 static void TransformWHT(const int16_t* in, int16_t* out) {
  const int kStep = 32;  // The store is only incrementing the pointer as if we
                         // had stored a single byte.
@@ -351,39 +324,39 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
    // load data into q0, q1
    "vld1.16         {q0, q1}, [%[in]]           \n"

-    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
-    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
-    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
-    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]

-    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
-    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
-    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
-    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2

    // Transpose
    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
-    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
-    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
    "vtrn.32         q0, q1                      \n"
    "vtrn.32         q2, q3                      \n"

-    "vmov.s32        q10, #3                     \n"  // dc = 3
-    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
-    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
-    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
-    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
-    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]

-    "vadd.s32        q0, q12, q13                \n"
-    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
    "vadd.s32        q1, q9, q8                  \n"
-    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
-    "vsub.s32        q2, q12, q13                \n"
-    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
    "vsub.s32        q3, q9, q8                  \n"
-    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3

    // set the results to output
    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
@@ -405,8 +378,8 @@ static void TransformWHT(const int16_t* in, int16_t* out) {

    : [out] "+r"(out)  // modified registers
    : [in] "r"(in), [kStep] "r"(kStep)  // constants
-    : "memory", "q0", "q1", "q2", "q3",
-      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9"  // clobbered
  );
 }

@@ -419,9 +392,7 @@ extern void VP8DspInitNEON(void);

 void VP8DspInitNEON(void) {
 #if defined(WEBP_USE_NEON)
-  VP8Transform = TransformTwo;
-  VP8TransformAC3 = TransformOne;  // no special code here
-  VP8TransformDC = TransformDC;
+  VP8Transform = TransformTwoNEON;
  VP8TransformWHT = TransformWHT;

  VP8SimpleVFilter16 = SimpleVFilter16NEON;
@@ -431,3 +402,6 @@ void VP8DspInitNEON(void) {
 #endif   // WEBP_USE_NEON
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -14,11 +14,11 @@

 #include "./dsp.h"

-#if defined(WEBP_USE_SSE2)
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

-// The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
-// one it seems => disable it by default. Uncomment the following to enable:
-// #define USE_TRANSFORM_AC3
+#if defined(WEBP_USE_SSE2)

 #include <emmintrin.h>
 #include "../dec/vp8i.h"
@@ -201,16 +201,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
-      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
-      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
-      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
+      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -230,66 +230,20 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
-      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
-      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
-      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
    } else {
      // Store four bytes/pixels per line.
-      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
    }
  }
 }

-#if defined(USE_TRANSFORM_AC3)
-#define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
-  static const int kC1 = 20091 + (1 << 16);
-  static const int kC2 = 35468;
-  const __m128i A = _mm_set1_epi16(in[0] + 4);
-  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
-  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
-  const int c1 = MUL(in[1], kC2);
-  const int d1 = MUL(in[1], kC1);
-  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
-  const __m128i B = _mm_adds_epi16(A, CD);
-  const __m128i m0 = _mm_adds_epi16(B, d4);
-  const __m128i m1 = _mm_adds_epi16(B, c4);
-  const __m128i m2 = _mm_subs_epi16(B, c4);
-  const __m128i m3 = _mm_subs_epi16(B, d4);
-  const __m128i zero = _mm_setzero_si128();
-  // Load the source pixels.
-  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
-  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
-  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
-  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
-  // Convert to 16b.
-  dst0 = _mm_unpacklo_epi8(dst0, zero);
-  dst1 = _mm_unpacklo_epi8(dst1, zero);
-  dst2 = _mm_unpacklo_epi8(dst2, zero);
-  dst3 = _mm_unpacklo_epi8(dst3, zero);
-  // Add the inverse transform.
-  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
-  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
-  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
-  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
-  // Unsigned saturate to 8b.
-  dst0 = _mm_packus_epi16(dst0, dst0);
-  dst1 = _mm_packus_epi16(dst1, dst1);
-  dst2 = _mm_packus_epi16(dst2, dst2);
-  dst3 = _mm_packus_epi16(dst3, dst3);
-  // Store the results.
-  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
-  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
-  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
-  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
-}
-#undef MUL
-#endif   // USE_TRANSFORM_AC3
-
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)

@@ -934,9 +888,6 @@ extern void VP8DspInitSSE2(void);
 void VP8DspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  VP8Transform = TransformSSE2;
-#if defined(USE_TRANSFORM_AC3)
-  VP8TransformAC3 = TransformAC3SSE2;
-#endif

  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
@@ -954,3 +905,6 @@ void VP8DspInitSSE2(void) {
 #endif   // WEBP_USE_SSE2
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -16,15 +16,14 @@

 #include "../webp/types.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

 //------------------------------------------------------------------------------
 // CPU detection

-#if defined(_MSC_VER) && _MSC_VER > 1310 && \
-    (defined(_M_X64) || defined(_M_IX86))
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif

@@ -86,11 +85,6 @@ typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;

-// specific to 2nd transform:
-typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
-                                   const struct VP8Matrix* const mtx);
-extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
-
 // Collect histogram for susceptibility calculation and accumulate in histo[].
 struct VP8Histogram;
 typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
@@ -108,7 +102,6 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
-extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
@@ -153,8 +146,6 @@ void VP8DspInit(void);

 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support

-// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
-// bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* top_y, const uint8_t* bottom_y,
    const uint8_t* top_u, const uint8_t* top_v,
@@ -217,7 +208,7 @@ void WebPInitPremultiplyNEON(void);

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -11,12 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include <assert.h>
 #include <stdlib.h>  // for abs()
-
 #include "./dsp.h"
 #include "../enc/vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 static WEBP_INLINE uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
@@ -188,7 +190,7 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {

 static void FTransformWHT(const int16_t* in, int16_t* out) {
  // input is 12b signed
-  int32_t tmp[16];
+  int16_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
@@ -650,31 +652,6 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  int n, last = -1;
-  for (n = 0; n < 16; ++n) {
-    const int j = kZigzag[n];
-    const int sign = (in[j] < 0);
-    const int coeff = sign ? -in[j] : in[j];
-    assert(mtx->sharpen_[j] == 0);
-    if (coeff > mtx->zthresh_[j]) {
-      const int Q = mtx->q_[j];
-      const int iQ = mtx->iq_[j];
-      const int B = mtx->bias_[j];
-      out[n] = QUANTDIV(coeff, iQ, B);
-      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
-      if (sign) out[n] = -out[n];
-      in[j] = out[n] * Q;
-      if (out[n]) last = n;
-    } else {
-      out[n] = 0;
-      in[j] = 0;
-    }
-  }
-  return (last >= 0);
-}
-
 //------------------------------------------------------------------------------
 // Block copy

@@ -709,7 +686,6 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
-VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;

 extern void VP8EncDspInitSSE2(void);
@@ -734,7 +710,6 @@ void VP8EncDspInit(void) {
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
  VP8Copy4x4 = Copy4x4;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@@ -751,3 +726,6 @@ void VP8EncDspInit(void) {
  }
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/enc_neon.c
+++ b/src/dsp/enc_neon.c
@@ -13,6 +13,10 @@

 #include "./dsp.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if defined(WEBP_USE_NEON)

 #include "../enc/vp8enci.h"
@@ -489,7 +493,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
    // q12/14 tmp[12-15]

    // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propagates them.
+    // case but the subtraction propagates them.
    "vswp            d3, d27                  \n"
    "vswp            d19, d31                 \n"

@@ -630,3 +634,6 @@ void VP8EncDspInitNEON(void) {
 #endif   // WEBP_USE_NEON
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -13,6 +13,10 @@

 #include "./dsp.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if defined(WEBP_USE_SSE2)
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
@@ -452,7 +456,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
 }

 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
-  int32_t tmp[16];
+  int16_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);
@@ -465,22 +469,22 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
    tmp[3 + i * 4] = a0 - a1;
  }
  {
-    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
-    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
-    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
-    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
-    const __m128i a0 = _mm_add_epi32(src0, src2);
-    const __m128i a1 = _mm_add_epi32(src1, src3);
-    const __m128i a2 = _mm_sub_epi32(src1, src3);
-    const __m128i a3 = _mm_sub_epi32(src0, src2);
-    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
-    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
-    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
-    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
-    const __m128i out0 = _mm_packs_epi32(b0, b1);
-    const __m128i out1 = _mm_packs_epi32(b2, b3);
-    _mm_storeu_si128((__m128i*)&out[0], out0);
-    _mm_storeu_si128((__m128i*)&out[8], out1);
+    const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi16(src0, src2);
+    const __m128i a1 = _mm_add_epi16(src1, src3);
+    const __m128i a2 = _mm_sub_epi16(src1, src3);
+    const __m128i a3 = _mm_sub_epi16(src0, src2);
+    const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1);
+    _mm_storel_epi64((__m128i*)&out[ 0], b0);
+    _mm_storel_epi64((__m128i*)&out[ 4], b1);
+    _mm_storel_epi64((__m128i*)&out[ 8], b2);
+    _mm_storel_epi64((__m128i*)&out[12], b3);
  }
 }

@@ -640,7 +644,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();

-  // Load, combine and transpose inputs.
+  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
@@ -826,6 +830,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
+  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  const __m128i sign0 = _mm_srai_epi16(in0, 15);
@@ -888,8 +894,17 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

-  _mm_storeu_si128((__m128i*)&in[0], in0);
-  _mm_storeu_si128((__m128i*)&in[8], in8);
+  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
+  {
+    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
+    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
+    in0 = _mm_and_si128(in0, cmp0);
+    in8 = _mm_and_si128(in8, cmp8);
+    _mm_storeu_si128((__m128i*)&in[0], in0);
+    _mm_storeu_si128((__m128i*)&in[8], in8);
+    out0 = _mm_and_si128(out0, cmp0);
+    out8 = _mm_and_si128(out8, cmp8);
+  }

  // zigzag the output before storing it.
  //
@@ -926,11 +941,6 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
  }
 }

-static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
-                                const VP8Matrix* const mtx) {
-  return QuantizeBlockSSE2(in, out, 0, mtx);
-}
-
 #endif   // WEBP_USE_SSE2

 //------------------------------------------------------------------------------
@@ -942,7 +952,6 @@ void VP8EncDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
  VP8CollectHistogram = CollectHistogramSSE2;
  VP8EncQuantizeBlock = QuantizeBlockSSE2;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
  VP8ITransform = ITransformSSE2;
  VP8FTransform = FTransformSSE2;
  VP8FTransformWHT = FTransformWHTSSE2;
@@ -955,3 +964,6 @@ void VP8EncDspInitSSE2(void) {
 #endif   // WEBP_USE_SSE2
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/lossless.c
+++ b/src/dsp/lossless.c
@@ -15,7 +15,14 @@

 #include "./dsp.h"

-#if defined(WEBP_USE_SSE2)
+// Define the following if target arch is sure to have SSE2
+// #define WEBP_TARGET_HAS_SSE2
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_TARGET_HAS_SSE2)
 #include <emmintrin.h>
 #endif

@@ -228,109 +235,6 @@ const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
  2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
 };

-const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
-  { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
-  { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
-  { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
-  { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
-  { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
-  {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
-  {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
-  {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
-  {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
-  {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
-  {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
-  {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
-  {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
-  {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-};
-
-const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
-   0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
-   0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
-  127,
-   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
-  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
-  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
-  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
-};
-
 float VP8LFastSLog2Slow(int v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_MAX) {
@@ -383,6 +287,61 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
  return Average2(Average2(a0, a1), Average2(a2, a3));
 }

+#if defined(WEBP_TARGET_HAS_SSE2)
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
+  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
+  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i V1 = _mm_add_epi16(C0, C1);
+  const __m128i V2 = _mm_sub_epi16(V1, C2);
+  const __m128i b = _mm_packus_epi16(V2, V2);
+  const uint32_t output = _mm_cvtsi128_si32(b);
+  return output;
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const uint32_t ave = Average2(c0, c1);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
+  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
+  const __m128i A1 = _mm_sub_epi16(A0, B0);
+  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
+  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
+  const __m128i A3 = _mm_srai_epi16(A2, 1);
+  const __m128i A4 = _mm_add_epi16(A0, A3);
+  const __m128i A5 = _mm_packus_epi16(A4, A4);
+  const uint32_t output = _mm_cvtsi128_si32(A5);
+  return output;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  int pa_minus_pb;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i A0 = _mm_cvtsi32_si128(a);
+  const __m128i B0 = _mm_cvtsi32_si128(b);
+  const __m128i C0 = _mm_cvtsi32_si128(c);
+  const __m128i AC0 = _mm_subs_epu8(A0, C0);
+  const __m128i CA0 = _mm_subs_epu8(C0, A0);
+  const __m128i BC0 = _mm_subs_epu8(B0, C0);
+  const __m128i CB0 = _mm_subs_epu8(C0, B0);
+  const __m128i AC = _mm_or_si128(AC0, CA0);
+  const __m128i BC = _mm_or_si128(BC0, CB0);
+  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
+  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
+  const __m128i diff = _mm_sub_epi16(pb, pa);
+  {
+    int16_t out[8];
+    _mm_storeu_si128((__m128i*)out, diff);
+    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
+  }
+  return (pa_minus_pb <= 0) ? a : b;
+}
+
+#else
+
 static WEBP_INLINE uint32_t Clip255(uint32_t a) {
  if (a < 256) {
    return a;
@@ -437,6 +396,7 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
  return (pa_minus_pb <= 0) ? a : b;
 }
+#endif

 //------------------------------------------------------------------------------
 // Predictors
@@ -489,19 +449,18 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  return pred;
 }
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
+  const uint32_t pred = Select(top[0], left, top[-1]);
  return pred;
 }
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  return pred;
 }
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  return pred;
 }

-// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
 typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
 static const PredictorFunc kPredictors[16] = {
  Predictor0, Predictor1, Predictor2, Predictor3,
@@ -757,8 +716,21 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
  }
 }

-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
  int i = 0;
+#if defined(WEBP_TARGET_HAS_SSE2)
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  for (; i + 4 < num_pixs; i += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    const __m128i out = _mm_sub_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], out);
+  }
+  // fallthrough and finish off with plain-C
+#endif
  for (; i < num_pixs; ++i) {
    const uint32_t argb = argb_data[i];
    const uint32_t green = (argb >> 8) & 0xff;
@@ -770,7 +742,23 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {

 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
+static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
+                                 int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  const uint32_t* const data_end = data + (y_end - y_start) * width;
+#if defined(WEBP_TARGET_HAS_SSE2)
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  for (; data + 4 < data_end; data += 4) {
+    const __m128i in = _mm_loadu_si128((__m128i*)data);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    const __m128i out = _mm_add_epi8(in, in_0g0g);
+    _mm_storeu_si128((__m128i*)data, out);
+  }
+  // fallthrough and finish off with plain-C
+#endif
  while (data < data_end) {
    const uint32_t argb = *data;
    const uint32_t green = ((argb >> 8) & 0xff);
@@ -1168,18 +1156,18 @@ COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
 void VP8LInverseTransform(const VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out) {
-  const int width = transform->xsize_;
  assert(row_start < row_end);
  assert(row_end <= transform->ysize_);
  switch (transform->type_) {
    case SUBTRACT_GREEN:
-      VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
+      AddGreenToBlueAndRed(transform, row_start, row_end, out);
      break;
    case PREDICTOR_TRANSFORM:
      PredictorInverseTransform(transform, row_start, row_end, out);
      if (row_end != transform->ysize_) {
        // The last predicted row in this iteration will be the top-pred row
        // for the first row in next iteration.
+        const int width = transform->xsize_;
        memcpy(out - width, out + (row_end - row_start - 1) * width,
               width * sizeof(*out));
      }
@@ -1194,7 +1182,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
        // Also, note that this is the only transform that applies on
        // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
        // transforms work on effective width of xsize_.
-        const int out_stride = (row_end - row_start) * width;
+        const int out_stride = (row_end - row_start) * transform->xsize_;
        const int in_stride = (row_end - row_start) *
            VP8LSubSampleSize(transform->xsize_, transform->bits_);
        uint32_t* const src = out + out_stride - in_stride;
@@ -1394,139 +1382,6 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,

 //------------------------------------------------------------------------------

-// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
-// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
-#if defined(WEBP_USE_SSE2)
-static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i V1 = _mm_add_epi16(C0, C1);
-  const __m128i V2 = _mm_sub_epi16(V1, C2);
-  const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
-                                                       uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i A1 = _mm_sub_epi16(A0, B0);
-  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
-  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
-  const __m128i A3 = _mm_srai_epi16(A2, 1);
-  const __m128i A4 = _mm_add_epi16(A0, A3);
-  const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
-}
-
-static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
-  int pa_minus_pb;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
-  const __m128i AC0 = _mm_subs_epu8(A0, C0);
-  const __m128i CA0 = _mm_subs_epu8(C0, A0);
-  const __m128i BC0 = _mm_subs_epu8(B0, C0);
-  const __m128i CB0 = _mm_subs_epu8(C0, B0);
-  const __m128i AC = _mm_or_si128(AC0, CA0);
-  const __m128i BC = _mm_or_si128(BC0, CB0);
-  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
-  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
-  const __m128i diff = _mm_sub_epi16(pb, pa);
-  {
-    int16_t out[8];
-    _mm_storeu_si128((__m128i*)out, diff);
-    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
-  }
-  return (pa_minus_pb <= 0) ? a : b;
-}
-
-static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
-  int i = 0;
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; i + 4 < num_pixs; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_sub_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
-  }
-  // fallthrough and finish off with plain-C
-  for (; i < num_pixs; ++i) {
-    const uint32_t argb = argb_data[i];
-    const uint32_t green = (argb >> 8) & 0xff;
-    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
-    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
-    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
-  }
-}
-
-static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; data + 4 < data_end; data += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)data);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_add_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)data, out);
-  }
-  // fallthrough and finish off with plain-C
-  while (data < data_end) {
-    const uint32_t argb = *data;
-    const uint32_t green = ((argb >> 8) & 0xff);
-    uint32_t red_blue = (argb & 0x00ff00ffu);
-    red_blue += (green << 16) | green;
-    red_blue &= 0x00ff00ffu;
-    *data++ = (argb & 0xff00ff00u) | red_blue;
-  }
-}
-
-extern void VP8LDspInitSSE2(void);
-
-void VP8LDspInitSSE2(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
-  VP8LSelect = SelectSSE2;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
-}
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
 #endif
-//------------------------------------------------------------------------------
-
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-VP8LPredSelectFunc VP8LSelect;
-VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
-
-void VP8LDspInit(void) {
-  VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
-  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
-  VP8LSelect = Select;
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8LDspInitSSE2();
-    }
-#endif
-  }
-}
-
-//------------------------------------------------------------------------------
-
--- a/src/dsp/lossless.h
+++ b/src/dsp/lossless.h
@@ -18,30 +18,10 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//------------------------------------------------------------------------------
-//
-
-typedef uint32_t (*VP8LPredClampedAddSubFunc)(uint32_t c0, uint32_t c1,
-                                              uint32_t c2);
-typedef uint32_t (*VP8LPredSelectFunc)(uint32_t c0, uint32_t c1, uint32_t c2);
-typedef void (*VP8LSubtractGreenFromBlueAndRedFunc)(uint32_t* argb_data,
-                                                    int num_pixs);
-typedef void (*VP8LAddGreenToBlueAndRedFunc)(uint32_t* data_start,
-                                             const uint32_t* data_end);
-
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
-extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
-extern VP8LPredSelectFunc VP8LSelect;
-extern VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
-extern VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
-
-// Must be called before calling any of the above methods.
-void VP8LDspInit(void);
-
 //------------------------------------------------------------------------------
 // Image transforms.

@@ -62,6 +42,9 @@ void VP8LColorIndexInverseTransformAlpha(
    const struct VP8LTransform* const transform, int y_start, int y_end,
    const uint8_t* src, uint8_t* dst);

+// Subtracts green from blue and red channels.
+void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
+
 void VP8LResidualImage(int width, int height, int bits,
                       uint32_t* const argb, uint32_t* const argb_scratch,
                       uint32_t* const image);
@@ -89,8 +72,8 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-float VP8LFastLog2Slow(int v);
-float VP8LFastSLog2Slow(int v);
+extern float VP8LFastLog2Slow(int v);
+extern float VP8LFastSLog2Slow(int v);
 static WEBP_INLINE float VP8LFastLog2(int v) {
  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
 }
@@ -99,105 +82,6 @@ static WEBP_INLINE float VP8LFastSLog2(int v) {
  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
 }

-// -----------------------------------------------------------------------------
-// PrefixEncode()
-
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
-      (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  _BitScanReverse(&first_set_bit, n);
-  return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1)))  // zero or a power of two.
-    return log_floor;
-  else
-    return log_floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
-                                                  int* const extra_bits) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
-                                              int* const extra_bits,
-                                              int* const extra_bits_value) {
-  const int highest_bit = BitsLog2Floor(--distance);
-  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-  *extra_bits = highest_bit - 1;
-  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
-  *code = 2 * highest_bit + second_highest_bit;
-}
-
-#define PREFIX_LOOKUP_IDX_MAX   512
-typedef struct {
-  int8_t code_;
-  int8_t extra_bits_;
-} VP8LPrefixCode;
-
-// These tables are derived using VP8LPrefixEncodeNoLUT.
-extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
-extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
-static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
-                                             int* const extra_bits) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-  } else {
-    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
-  }
-}
-
-static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
-                                         int* const extra_bits,
-                                         int* const extra_bits_value) {
-  if (distance < PREFIX_LOOKUP_IDX_MAX) {
-    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
-    *code = prefix_code.code_;
-    *extra_bits = prefix_code.extra_bits_;
-    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
-  } else {
-    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
-  }
-}

 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@@ -213,7 +97,7 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@@ -14,7 +14,9 @@
 #include "./dsp.h"
 #include "./yuv.h"

-#include <assert.h>
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

 //------------------------------------------------------------------------------
 // Fancy upsampler
@@ -43,12 +45,11 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  assert(top_y != NULL);                                                       \
-  {                                                                            \
+  if (top_y) {                                                                 \
    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
  }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if (bottom_y) {                                                              \
    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
  }                                                                            \
@@ -59,7 +60,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    {                                                                          \
+    if (top_y) {                                                               \
      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
@@ -67,7 +68,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
           top_dst + (2 * x - 0) * XSTEP);                                     \
    }                                                                          \
-    if (bottom_y != NULL) {                                                    \
+    if (bottom_y) {                                                            \
      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
@@ -79,12 +80,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
    l_uv = uv;                                                                 \
  }                                                                            \
  if (!(len & 1)) {                                                            \
-    {                                                                          \
+    if (top_y) {                                                               \
      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
           top_dst + (len - 1) * XSTEP);                                       \
    }                                                                          \
-    if (bottom_y != NULL) {                                                    \
+    if (bottom_y) {                                                            \
      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
           bottom_dst + (len - 1) * XSTEP);                                    \
@@ -167,8 +168,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
  const int half_len = len >> 1;                                               \
  int x;                                                                       \
-  assert(top_dst != NULL);                                                     \
-  {                                                                            \
+  if (top_dst != NULL) {                                                       \
    for (x = 0; x < half_len; ++x) {                                           \
      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
@@ -364,3 +364,6 @@ void WebPInitPremultiply(void) {
 #endif  // FANCY_UPSAMPLING
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/upsampling_neon.c
+++ b/src/dsp/upsampling_neon.c
@@ -14,6 +14,10 @@

 #include "./dsp.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if defined(WEBP_USE_NEON)

 #include <assert.h>
@@ -23,9 +27,6 @@

 #ifdef FANCY_UPSAMPLING

-//-----------------------------------------------------------------------------
-// U/V upsampling
-
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
 #define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
  uint8x8_t a = vld1_u8(r1);                                            \
@@ -84,90 +85,125 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
  Upsample16Pixels(r1, r2, out);                                        \
 }

-//-----------------------------------------------------------------------------
-// YUV->RGB conversion
+#define CY  76283
+#define CVR 89858
+#define CUG 22014
+#define CVG 45773
+#define CUB 113618

-static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
-
-#define v255 vmov_n_u8(255)
-
-#define STORE_Rgb(out, r, g, b) do {                                    \
-  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
-  vst3_u8(out, r_g_b);                                                  \
-} while (0)
-
-#define STORE_Bgr(out, r, g, b) do {                                    \
-  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
-  vst3_u8(out, b_g_r);                                                  \
-} while (0)
-
-#define STORE_Rgba(out, r, g, b) do {                                   \
-  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
-  vst4_u8(out, r_g_b_v255);                                             \
-} while (0)
-
-#define STORE_Bgra(out, r, g, b) do {                                   \
-  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
-  vst4_u8(out, b_g_r_v255);                                             \
-} while (0)
+static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };

 #define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < N; i += 8) {                                          \
-    const int off = ((cur_x) + i) * XSTEP;                              \
-    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
+    int off = ((cur_x) + i) * XSTEP;                                    \
+    uint8x8_t y  = vld1_u8(src_y + (cur_x)  + i);                       \
    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
-    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
-    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
-    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
-    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
-    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
-    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
-    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
-    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
-    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
-    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
-    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
-    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
-    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
-    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
-    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
-    /* vrshrn_n_s32() already incorporates the rounding constant */     \
-    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
-                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
-    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
-                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
-    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
-                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
-    STORE_ ## FMT(out + off, y, u, v);                                  \
+    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));             \
+    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));            \
+    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));            \
+                                                                        \
+    int16x8_t ud = vshlq_n_s16(uu, 1);                                  \
+    int16x8_t vd = vshlq_n_s16(vv, 1);                                  \
+                                                                        \
+    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),  \
+                                     vget_low_s16(vd),  cf16, 0);       \
+    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
+                                     vget_high_s16(vd), cf16, 0);       \
+    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16),                  \
+                                vrshrn_n_s32(vrh, 16));                 \
+                                                                        \
+    int32x4_t vl = vmovl_s16(vget_low_s16(vv));                         \
+    int32x4_t vh = vmovl_s16(vget_high_s16(vv));                        \
+    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu),  cf16, 1);     \
+    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);     \
+    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv),  cf16, 2);  \
+    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);  \
+    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16),                  \
+                                vrshrn_n_s32(gch, 16));                 \
+                                                                        \
+    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),  \
+                                     vget_low_s16(ud),  cf16, 3);       \
+    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
+                                     vget_high_s16(ud), cf16, 3);       \
+    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16),                  \
+                                vrshrn_n_s32(ubh, 16));                 \
+                                                                        \
+    int32x4_t rl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(vr));      \
+    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));     \
+    int32x4_t gl = vsubl_s16(vget_low_s16(yy),  vget_low_s16(gc));      \
+    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));     \
+    int32x4_t bl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(ub));      \
+    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));     \
+                                                                        \
+    rl = vmulq_lane_s32(rl, cf32, 0);                                   \
+    rh = vmulq_lane_s32(rh, cf32, 0);                                   \
+    gl = vmulq_lane_s32(gl, cf32, 0);                                   \
+    gh = vmulq_lane_s32(gh, cf32, 0);                                   \
+    bl = vmulq_lane_s32(bl, cf32, 0);                                   \
+    bh = vmulq_lane_s32(bh, cf32, 0);                                   \
+                                                                        \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16),                  \
+                                 vrshrn_n_s32(rh, 16)));                \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16),                  \
+                                 vrshrn_n_s32(gh, 16)));                \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16),                  \
+                                 vrshrn_n_s32(bh, 16)));                \
+    STR_ ## FMT(out + off, y, u, v);                                    \
  }                                                                     \
 }

-#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
+#define v255 vmov_n_u8(255)
+
+#define STR_Rgb(out, r, g, b) do {                                      \
+  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
+  vst3_u8(out, r_g_b);                                                  \
+} while (0)
+
+#define STR_Bgr(out, r, g, b) do {                                      \
+  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
+  vst3_u8(out, b_g_r);                                                  \
+} while (0)
+
+#define STR_Rgba(out, r, g, b) do {                                     \
+  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
+  vst4_u8(out, r_g_b_v255);                                             \
+} while (0)
+
+#define STR_Bgra(out, r, g, b) do {                                     \
+  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
+  vst4_u8(out, b_g_r_v255);                                             \
+} while (0)
+
+#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < N; i++) {                                             \
-    const int off = ((cur_x) + i) * XSTEP;                              \
-    const int y = src_y[(cur_x) + i];                                   \
-    const int u = (src_uv)[i];                                          \
-    const int v = (src_uv)[i + 16];                                     \
-    FUNC(y, u, v, rgb + off);                                           \
+    int off = ((cur_x) + i) * XSTEP;                                    \
+    int y = src_y[(cur_x) + i];                                         \
+    int u = (src_uv)[i];                                                \
+    int v = (src_uv)[i + 16];                                           \
+    VP8YuvTo ## FMT(y, u, v, rgb + off);                                \
  }                                                                     \
 }

 #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) {                \
-  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
-  if (bottom_y != NULL) {                                               \
+  if (top_y) {                                                          \
+    CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                \
+  }                                                                     \
+  if (bottom_y) {                                                       \
    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
  }                                                                     \
 }

-#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
+#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) {                \
-  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
-  if (bottom_y != NULL) {                                               \
-    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
+  if (top_y) {                                                          \
+    CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
+  }                                                                     \
+  if (bottom_y) {                                                       \
+    CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
  }                                                                     \
 }

@@ -189,19 +225,18 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
                                                                        \
-  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
-  const int32x2_t cf32 = vmov_n_s32(kUToB);                             \
+  const int16x4_t cf16 = vld1_s16(coef);                                \
+  const int32x2_t cf32 = vmov_n_s32(CY);                                \
  const uint8x8_t u16  = vmov_n_u8(16);                                 \
  const uint8x8_t u128 = vmov_n_u8(128);                                \
                                                                        \
  /* Treat the first pixel in regular way */                            \
-  assert(top_y != NULL);                                                \
-  {                                                                     \
+  if (top_y) {                                                          \
    const int u0 = (top_u[0] + u_diag) >> 1;                            \
    const int v0 = (top_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
  }                                                                     \
-  if (bottom_y != NULL) {                                               \
+  if (bottom_y) {                                                       \
    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
@@ -220,7 +255,7 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
                                                                        \
  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
-  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
+  CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv,                      \
                top_dst, bottom_dst, last_pos, len - last_pos);         \
 }

@@ -236,8 +271,6 @@ NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)

 //------------------------------------------------------------------------------

-#ifdef FANCY_UPSAMPLING
-
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

 void WebPInitUpsamplersNEON(void) {
@@ -256,10 +289,6 @@ void WebPInitPremultiplyNEON(void) {
 #endif   // WEBP_USE_NEON
 }

-#else
-
-// this empty function is to avoid an empty .o
-void WebPInitPremultiplyNEON(void) {}
-
-#endif  // FANCY_UPSAMPLING
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@@ -13,6 +13,10 @@

 #include "./dsp.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if defined(WEBP_USE_SSE2)

 #include <assert.h>
@@ -47,7 +51,7 @@
  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
 } while (0)

-// pack and store two alternating pixel rows
+// pack and store two alternating pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
@@ -83,8 +87,8 @@
  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                               \
  /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
-  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
+  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
+  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
 }

 // Turn the macro into a function for reducing code-size when non-critical
@@ -104,68 +108,69 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
  Upsample32Pixels(r1, r2, out);                                               \
 }

-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
  int n;                                                                       \
-  for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
-         top_dst + ((cur_x) + n) * XSTEP);                                     \
-  }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if (top_y) {                                                                 \
    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
+      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
+           top_dst + ((cur_x) + n) * XSTEP);                                   \
+    }                                                                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    for (n = 0; n < (num_pixels); ++n) {                                       \
+      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
           bottom_dst + ((cur_x) + n) * XSTEP);                                \
    }                                                                          \
  }                                                                            \
 }

-#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
-                       top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
-  if (bottom_y != NULL) {                                                      \
-    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
-             bottom_dst + (cur_x) * XSTEP);                                    \
-  }                                                                            \
-} while (0)
-
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* cur_u, const uint8_t* cur_v,              \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int uv_pos, pos;                                                             \
-  /* 16byte-aligned array to cache reconstructed u and v */                    \
+  int block;                                                                   \
+  /* 16 byte aligned array to cache reconstructed u and v */                   \
  uint8_t uv_buf[4 * 32 + 15];                                                 \
-  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
-  uint8_t* const r_v = r_u + 32;                                               \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
+  const int uv_len = (len + 1) >> 1;                                           \
+  /* 17 u/v samples must be readable for each block */                         \
+  const int num_blocks = (uv_len - 1) >> 4;                                    \
+  const int leftover = uv_len - num_blocks * 16;                               \
+  const int last_pos = 1 + 32 * num_blocks;                                    \
                                                                               \
-  assert(top_y != NULL);                                                       \
-  {   /* Treat the first pixel in regular way */                               \
-    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
-    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
-    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
-    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
-    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
-    if (bottom_y != NULL) {                                                    \
-      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
-      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
-      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
-    }                                                                          \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                         \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                         \
+                                                                               \
+  assert(len > 0);                                                             \
+  /* Treat the first pixel in the regular way */                               \
+  if (top_y) {                                                                 \
+    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
+    FUNC(top_y[0], u0, v0, top_dst);                                           \
  }                                                                            \
-  /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */  \
-  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
-    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
-    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
-    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  if (bottom_y) {                                                              \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
+    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
  }                                                                            \
-  if (len > 1) {                                                               \
-    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
-    assert(left_over > 0);                                                     \
-    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
-    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
-                pos, len - pos);                                               \
+                                                                               \
+  for (block = 0; block < num_blocks; ++block) {                               \
+    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
+    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
+                32 * block + 1, 32)                                            \
+    top_u += 16;                                                               \
+    cur_u += 16;                                                               \
+    top_v += 16;                                                               \
+    cur_v += 16;                                                               \
  }                                                                            \
+                                                                               \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
+  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
+              last_pos, len - last_pos);                                       \
 }

 // SSE2 variants of the fancy upsampler.
@@ -179,7 +184,6 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
-#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC

 #endif  // FANCY_UPSAMPLING
@@ -188,13 +192,10 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)

 //------------------------------------------------------------------------------

-#ifdef FANCY_UPSAMPLING
-
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

 void WebPInitUpsamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
-  VP8YUVInitSSE2();
  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
@@ -209,10 +210,8 @@ void WebPInitPremultiplySSE2(void) {
 #endif   // WEBP_USE_SSE2
 }

-#else
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif

-// this empty function is to avoid an empty .o
-void WebPInitPremultiplySSE2(void) {}
-
-#endif  // FANCY_UPSAMPLING

--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -13,8 +13,16 @@

 #include "./yuv.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

-#if defined(WEBP_YUV_USE_TABLE)
+#ifdef WEBP_YUV_USE_TABLE
+
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

 static int done = 0;

@@ -22,11 +30,6 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
  return v < 0 ? 0 : v > max_value ? max_value : v;
 }

-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
 void VP8YUVInit(void) {
  int i;
  if (done) {
@@ -67,141 +70,6 @@ void VP8YUVInit(void) {}

 #endif  // WEBP_YUV_USE_TABLE

-//-----------------------------------------------------------------------------
-// SSE2 extras
-
-#if defined(WEBP_USE_SSE2)
-
-#ifdef FANCY_UPSAMPLING
-
-#include <emmintrin.h>
-#include <string.h>   // for memcpy
-
-typedef union {   // handy struct for converting SSE2 registers
-  int32_t i32[4];
-  uint8_t u8[16];
-  __m128i m;
-} VP8kCstSSE2;
-
-static int done_sse2 = 0;
-static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
-
-void VP8YUVInitSSE2(void) {
-  if (!done_sse2) {
-    int i;
-    for (i = 0; i < 256; ++i) {
-      VP8kYtoRGBA[i].i32[0] =
-        VP8kYtoRGBA[i].i32[1] =
-        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
-      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
-
-      VP8kUtoRGBA[i].i32[0] = 0;
-      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
-      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
-      VP8kUtoRGBA[i].i32[3] = 0;
-
-      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
-      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
-      VP8kVtoRGBA[i].i32[2] = 0;
-      VP8kVtoRGBA[i].i32[3] = 0;
-    }
-    done_sse2 = 1;
-  }
-}
-
-static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
-  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
-  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
-  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
-  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
-  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
-  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
-  return rgba2;
-}
-
-static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
-                                        uint8_t* const rgb) {
-  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
-  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
-  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
-  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
-  _mm_storel_epi64((__m128i*)rgb, tmp2);
-}
-
-static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
-                                        uint8_t* const bgr) {
-  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
-  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
-  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
-  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
-  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
-  _mm_storel_epi64((__m128i*)bgr, tmp3);
-}
-
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
-  int n;
-  for (n = 0; n < 32; n += 4) {
-    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
-    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
-    const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
-    const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
-    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
-    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
-    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
-    _mm_storeu_si128((__m128i*)dst, tmp2);
-    dst += 4 * 4;
-  }
-}
-
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
-  int n;
-  for (n = 0; n < 32; n += 2) {
-    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
-    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
-    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
-    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
-    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
-    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
-    _mm_storel_epi64((__m128i*)dst, tmp3);
-    dst += 4 * 2;
-  }
-}
-
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
-  int n;
-  uint8_t tmp0[2 * 3 + 5 + 15];
-  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
-  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory
-    VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
-  }
-  // Last two pixels are special: we write in a tmp buffer before sending
-  // to dst.
-  VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
-  VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
-  memcpy(dst + n * 3, tmp, 2 * 3);
-}
-
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
-  int n;
-  uint8_t tmp0[2 * 3 + 5 + 15];
-  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
-  for (n = 0; n < 30; ++n) {
-    VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
-  }
-  VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
-  VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
-  memcpy(dst + n * 3, tmp, 2 * 3);
-}
-
-#else
-
-void VP8YUVInitSSE2(void) {}
-
-#endif  // FANCY_UPSAMPLING
-
-#endif  // WEBP_USE_SSE2
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -14,7 +14,7 @@
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
+// We use 16bit fixed point operations for RGB->YUV conversion.
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
@@ -23,24 +23,21 @@
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
 // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
-// So in this case the formulae should read:
+// So in this case the formulae should be read as:
 //   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
 //   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
 //   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
-// once factorized.
-// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
-// That's the maximum possible for a convenient ARM implementation.
+// once factorized. Here too, 16bit fixed precision is used.
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_

-#include "./dsp.h"
 #include "../dec/decode_vp8.h"

 // Define the following to use the LUT-based code:
-// #define WEBP_YUV_USE_TABLE
+#define WEBP_YUV_USE_TABLE

 #if defined(WEBP_EXPERIMENTAL_FEATURES)
 // Do NOT activate this feature for real compression. This is only experimental!
@@ -55,111 +52,25 @@
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-enum {
-  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
-  YUV_HALF = 1 << (YUV_FIX - 1),
-  YUV_MASK = (256 << YUV_FIX) - 1,
-  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
-
-  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
-  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
-  YUV_MASK2 = (256 << YUV_FIX2) - 1
+enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_HALF = 1 << (YUV_FIX - 1),
+       YUV_MASK = (256 << YUV_FIX) - 1,
+       YUV_RANGE_MIN = -227,        // min value of r/g/b output
+       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
 };

-// These constants are 14b fixed-point version of ITU-R BT.601 constants.
-#define kYScale 19077    // 1.164 = 255 / 219
-#define kVToR   26149    // 1.596 = 255 / 112 * 0.701
-#define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
-#define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
-#define kUToB   33050    // 2.018 = 255 / 112 * 0.886
-#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
-#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
-#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
-
-//------------------------------------------------------------------------------
-
-#if !defined(WEBP_YUV_USE_TABLE)
-
-// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
-
-static WEBP_INLINE int VP8Clip8(int v) {
-  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
-}
-
-static WEBP_INLINE int VP8YUVToR(int y, int v) {
-  return VP8Clip8(kYScale * y + kVToR * v + kRCst);
-}
-
-static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
-  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
-}
-
-static WEBP_INLINE int VP8YUVToB(int y, int u) {
-  return VP8Clip8(kYScale * y + kUToB * u + kBCst);
-}
-
-static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
-                                    uint8_t* const rgb) {
-  rgb[0] = VP8YUVToR(y, v);
-  rgb[1] = VP8YUVToG(y, u, v);
-  rgb[2] = VP8YUVToB(y, u);
-}
-
-static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
-                                    uint8_t* const bgr) {
-  bgr[0] = VP8YUVToB(y, u);
-  bgr[1] = VP8YUVToG(y, u, v);
-  bgr[2] = VP8YUVToR(y, v);
-}
-
-static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
-                                       uint8_t* const rgb) {
-  const int r = VP8YUVToR(y, v);      // 5 usable bits
-  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
-  const int b = VP8YUVToB(y, u);      // 5 usable bits
-  const int rg = (r & 0xf8) | (g >> 5);
-  const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
-  rgb[0] = gb;
-  rgb[1] = rg;
-#else
-  rgb[0] = rg;
-  rgb[1] = gb;
-#endif
-}
-
-static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
-                                         uint8_t* const argb) {
-  const int r = VP8YUVToR(y, v);        // 4 usable bits
-  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
-  const int b = VP8YUVToB(y, u);        // 4 usable bits
-  const int rg = (r & 0xf0) | (g >> 4);
-  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
-  argb[0] = ba;
-  argb[1] = rg;
-#else
-  argb[0] = rg;
-  argb[1] = ba;
-#endif
-}
-
-#else
-
-// Table-based version, not totally equivalent to the SSE2 version.
-// Rounding diff is only +/-1 though.
+#ifdef WEBP_YUV_USE_TABLE

 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

-static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -169,7 +80,7 @@ static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }

-static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
                                    uint8_t* const bgr) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
@@ -179,15 +90,15 @@ static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }

-static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
                                       uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
-  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+  const uint8_t rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+                      (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  const uint8_t gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+                      (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = gb;
  rgb[1] = rg;
@@ -197,14 +108,94 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
 #endif
 }

-static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
                                         uint8_t* const argb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
-  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+  const uint8_t rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+                      VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  const uint8_t ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#else   // Table-free version (slower on x86)
+
+// These constants are 16b fixed-point version of ITU-R BT.601 constants
+#define kYScale 76309      // 1.164 = 255 / 219
+#define kVToR   104597     // 1.596 = 255 / 112 * 0.701
+#define kUToG   25674      // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG   53278      // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB   132201     // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF)
+
+static WEBP_INLINE uint8_t VP8Clip8(int v) {
+  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX)
+                                : (v < 0) ? 0u : 255u;
+}
+
+static WEBP_INLINE uint8_t VP8ClipN(int v, int N) {  // clip to N bits
+  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N)))
+                                : (v < 0) ? 0u : (255u >> (8 - N));
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {
+  return kYScale * y + kVToR * v + kRCst;
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
+  return kYScale * y - kUToG * u - kVToG * v + kGCst;
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {
+  return kYScale * y  + kUToB * u + kBCst;
+}
+
+static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const rgb) {
+  rgb[0] = VP8Clip8(VP8YUVToR(y, v));
+  rgb[1] = VP8Clip8(VP8YUVToG(y, u, v));
+  rgb[2] = VP8Clip8(VP8YUVToB(y, u));
+}
+
+static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const bgr) {
+  bgr[0] = VP8Clip8(VP8YUVToB(y, u));
+  bgr[1] = VP8Clip8(VP8YUVToG(y, u, v));
+  bgr[2] = VP8Clip8(VP8YUVToR(y, v));
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                       uint8_t* const rgb) {
+  const int r = VP8Clip8(VP8YUVToR(y, v));        // red depends on V
+  const int g = VP8ClipN(VP8YUVToG(y, u, v), 6);  // clipped to 6 bits
+  const int b = VP8ClipN(VP8YUVToB(y, u), 5);     // blue depends on U
+  const uint8_t rg = (r & 0xf8) | (g >> 3);       // r[7:3] | g[5:3]
+  const uint8_t gb = (g << 5) | b;                // g[2:0] | b[4:0]
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+                                         uint8_t* const argb) {
+  const int r = VP8Clip8(VP8YUVToR(y, v));        // red depends on V
+  const int g = VP8ClipN(VP8YUVToG(y, u, v), 4);  // clipped to 4 bits
+  const int b = VP8Clip8(VP8YUVToB(y, u));        // blue depends on U
+  const uint8_t rg = (r & 0xf0) | g;              // r[7:4] | g[3:0]
+  const uint8_t ba = b | 0x0f;   // overwrite the lower 4 bits
 #ifdef WEBP_SWAP_16BIT_CSP
   argb[0] = ba;
   argb[1] = rg;
@@ -216,9 +207,6 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,

 #endif  // WEBP_YUV_USE_TABLE

-//-----------------------------------------------------------------------------
-// Alpha handling variants
-
 static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const argb) {
  argb[0] = 0xff;
@@ -240,77 +228,56 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);

-//-----------------------------------------------------------------------------
-// SSE2 extra functions (mostly for upsampling_sse2.c)
-
-#if defined(WEBP_USE_SSE2)
-
-#if defined(FANCY_UPSAMPLING)
-// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-#endif  // FANCY_UPSAMPLING
-
-// Must be called to initialize tables before using the functions.
-void VP8YUVInitSSE2(void);
-
-#endif    // WEBP_USE_SSE2
-
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion

-// Stub functions that can be called with various rounding values:
-static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
-  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
-  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
+static WEBP_INLINE int VP8ClipUV(int v) {
+  v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
+  return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
 }

 #ifndef USE_YUVj

-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
  const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
 }

-static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
  const int u = -9719 * r - 19081 * g + 28800 * b;
-  return VP8ClipUV(u, rounding);
+  return VP8ClipUV(u);
 }

-static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
  const int v = +28800 * r - 24116 * g - 4684 * b;
-  return VP8ClipUV(v, rounding);
+  return VP8ClipUV(v);
 }

 #else

 // This JPEG-YUV colorspace, only for comparison!
-// These are also 16bit precision coefficients from Rec.601, but with full
+// These are also 16-bit precision coefficients from Rec.601, but with full
 // [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
+  const int kRound = (1 << (YUV_FIX - 1));
  const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + rounding) >> YUV_FIX;  // no need to clip
+  return (luma + kRound) >> YUV_FIX;  // no need to clip
 }

-static WEBP_INLINE int VP8_RGB_TO_U(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
  const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u, rounding);
+  return VP8ClipUV(u);
 }

-static WEBP_INLINE int VP8_RGB_TO_V(int r, int g, int b, int rounding) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
  const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v, rounding);
+  return VP8ClipUV(v);
 }

 #endif    // USE_YUVj

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -19,6 +19,10 @@
 #include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
 // The pre-processing (quantization) is performed if 'quality' is less than 100.
@@ -67,7 +71,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
    const uint8_t* src = data;
    for (j = 0; j < picture.height; ++j) {
      for (i = 0; i < picture.width; ++i) {
-        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
+        dst[i] = (src[i] << 8) | 0xff000000u;
      }
      src += width;
      dst += picture.argb_stride;
@@ -77,10 +81,8 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
  WebPConfigInit(&config);
  config.lossless = 1;
  config.method = effort_level;  // impact is very small
-  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
-  // lower methods (3 and below) is less than the threshold for triggering
-  // costly 'BackwardReferencesTraceBackwards'.
-  config.quality = 8.f * effort_level;
+  // Set a moderate default quality setting for alpha.
+  config.quality = 10.f * effort_level;
  assert(config.quality >= 0 && config.quality <= 100.f);

  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
@@ -97,19 +99,12 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,

 // -----------------------------------------------------------------------------

-// Small struct to hold the result of a filter mode compression attempt.
-typedef struct {
-  size_t score;
-  VP8BitWriter bw;
-  WebPAuxStats stats;
-} FilterTrial;
-
-// This function always returns an initialized 'bw' object, even upon error.
 static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                               int method, int filter, int reduce_levels,
                               int effort_level,  // in [0..6] range
                               uint8_t* const tmp_alpha,
-                               FilterTrial* result) {
+                               VP8BitWriter* const bw,
+                               WebPAuxStats* const stats) {
  int ok = 0;
  const uint8_t* alpha_src;
  WebPFilterFunc filter_func;
@@ -130,8 +125,8 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
  header = method | (filter << 2);
  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;

-  VP8BitWriterInit(&result->bw, expected_size);
-  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  VP8BitWriterInit(bw, expected_size);
+  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);

  filter_func = WebPFilters[filter];
  if (filter_func != NULL) {
@@ -142,14 +137,12 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
  }

  if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
-    ok = ok && !result->bw.error_;
+    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
+    ok = ok && !bw->error_;
  } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level,
-                        &result->bw, &result->stats);
-    VP8BitWriterFinish(&result->bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
+    VP8BitWriterFinish(bw);
  }
-  result->score = VP8BitWriterSize(&result->bw);
  return ok;
 }

@@ -184,85 +177,6 @@ static int GetNumColors(const uint8_t* data, int width, int height,
  return colors;
 }

-#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
-#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
-
-// Given the input 'filter' option, return an OR'd bit-set of filters to try.
-static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
-                             int filter, int effort_level) {
-  uint32_t bit_map = 0U;
-  if (filter == WEBP_FILTER_FAST) {
-    // Quick estimate of the best candidate.
-    int try_filter_none = (effort_level > 3);
-    const int kMinColorsForFilterNone = 16;
-    const int kMaxColorsForFilterNone = 192;
-    const int num_colors = GetNumColors(alpha, width, height, width);
-    // For low number of colors, NONE yields better compression.
-    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-             EstimateBestFilter(alpha, width, height, width);
-    bit_map |= 1 << filter;
-    // For large number of colors, try FILTER_NONE in addition to the best
-    // filter as well.
-    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
-      bit_map |= FILTER_TRY_NONE;
-    }
-  } else if (filter == WEBP_FILTER_NONE) {
-    bit_map = FILTER_TRY_NONE;
-  } else {  // WEBP_FILTER_BEST -> try all
-    bit_map = FILTER_TRY_ALL;
-  }
-  return bit_map;
-}
-
-static void InitFilterTrial(FilterTrial* const score) {
-  score->score = (size_t)~0U;
-  VP8BitWriterInit(&score->bw, 0);
-}
-
-static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
-                                 size_t data_size, int method, int filter,
-                                 int reduce_levels, int effort_level,
-                                 uint8_t** const output,
-                                 size_t* const output_size,
-                                 WebPAuxStats* const stats) {
-  int ok = 1;
-  FilterTrial best;
-  uint32_t try_map =
-      GetFilterMap(alpha, width, height, filter, effort_level);
-  InitFilterTrial(&best);
-  if (try_map != FILTER_TRY_NONE) {
-    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);
-    if (filtered_alpha == NULL) return 0;
-
-    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
-      if (try_map & 1) {
-        FilterTrial trial;
-        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
-                                 reduce_levels, effort_level, filtered_alpha,
-                                 &trial);
-        if (ok && trial.score < best.score) {
-          VP8BitWriterWipeOut(&best.bw);
-          best = trial;
-        } else {
-          VP8BitWriterWipeOut(&trial.bw);
-        }
-      }
-    }
-    free(filtered_alpha);
-  } else {
-    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
-                             reduce_levels, effort_level, NULL, &best);
-  }
-  if (ok) {
-    if (stats != NULL) *stats = best.stats;
-    *output_size = VP8BitWriterSize(&best.bw);
-    *output = VP8BitWriterBuf(&best.bw);
-  } else {
-    VP8BitWriterWipeOut(&best.bw);
-  }
-  return ok;
-}
-
 static int EncodeAlpha(VP8Encoder* const enc,
                       int quality, int method, int filter,
                       int effort_level,
@@ -293,11 +207,6 @@ static int EncodeAlpha(VP8Encoder* const enc,
    return 0;
  }

-  if (method == ALPHA_NO_COMPRESSION) {
-    // Don't filter, as filtering will make no impact on compressed size.
-    filter = WEBP_FILTER_NONE;
-  }
-
  quant_alpha = (uint8_t*)malloc(data_size);
  if (quant_alpha == NULL) {
    return 0;
@@ -316,19 +225,105 @@ static int EncodeAlpha(VP8Encoder* const enc,
  }

  if (ok) {
-    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
-                               filter, reduce_levels, effort_level, output,
-                               output_size, pic->stats);
-    if (pic->stats != NULL) {  // need stats?
-      pic->stats->coded_size += (int)(*output_size);
-      enc->sse_[3] = sse;
-    }
-  }
+    VP8BitWriter bw;
+    int test_filter;
+    uint8_t* filtered_alpha = NULL;
+    int try_filter_none = (effort_level > 3);

+    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of the best candidate.
+      const int kMinColorsForFilterNone = 16;
+      const int kMaxColorsForFilterNone = 192;
+      const int num_colors = GetNumColors(quant_alpha, width, height, width);
+      // For low number of colors, NONE yields better compression.
+      filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
+               EstimateBestFilter(quant_alpha, width, height, width);
+      // For large number of colors, try FILTER_NONE in addition to the best
+      // filter as well.
+      if (num_colors > kMaxColorsForFilterNone) {
+        try_filter_none = 1;
+      }
+    }
+
+    // Test for WEBP_FILTER_NONE for higher effort levels.
+    if (try_filter_none || filter == WEBP_FILTER_NONE) {
+      ok = EncodeAlphaInternal(quant_alpha, width, height,
+                               method, WEBP_FILTER_NONE, reduce_levels,
+                               effort_level, NULL, &bw, pic->stats);
+
+      if (!ok) {
+        VP8BitWriterWipeOut(&bw);
+        goto End;
+      }
+    }
+    // Stop?
+    if (filter == WEBP_FILTER_NONE) {
+      goto Ok;
+    }
+
+    filtered_alpha = (uint8_t*)malloc(data_size);
+    ok = (filtered_alpha != NULL);
+    if (!ok) {
+      goto End;
+    }
+
+    // Try the other mode(s).
+    {
+      WebPAuxStats best_stats;
+      size_t best_score = try_filter_none ?
+                          VP8BitWriterSize(&bw) : (size_t)~0U;
+      int wipe_tmp_bw = try_filter_none;
+
+      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
+      if (pic->stats != NULL) best_stats = *pic->stats;
+      for (test_filter =
+           try_filter_none ? WEBP_FILTER_HORIZONTAL : WEBP_FILTER_NONE;
+           ok && (test_filter <= WEBP_FILTER_GRADIENT);
+           ++test_filter) {
+        VP8BitWriter tmp_bw;
+        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
+          continue;
+        }
+        ok = EncodeAlphaInternal(quant_alpha, width, height,
+                                 method, test_filter, reduce_levels,
+                                 effort_level, filtered_alpha, &tmp_bw,
+                                 pic->stats);
+        if (ok) {
+          const size_t score = VP8BitWriterSize(&tmp_bw);
+          if (score < best_score) {
+            // swap bitwriter objects.
+            VP8BitWriter tmp = tmp_bw;
+            tmp_bw = bw;
+            bw = tmp;
+            best_score = score;
+            if (pic->stats != NULL) best_stats = *pic->stats;
+          }
+        } else {
+          VP8BitWriterWipeOut(&bw);
+        }
+        if (wipe_tmp_bw) {
+          VP8BitWriterWipeOut(&tmp_bw);
+        }
+        wipe_tmp_bw = 1;  // For next filter trial for WEBP_FILTER_BEST.
+      }
+      if (pic->stats != NULL) *pic->stats = best_stats;
+    }
+ Ok:
+    if (ok) {
+      *output_size = VP8BitWriterSize(&bw);
+      *output = VP8BitWriterBuf(&bw);
+      if (pic->stats != NULL) {         // need stats?
+        pic->stats->coded_size += (int)(*output_size);
+        enc->sse_[3] = sse;
+      }
+    }
+    free(filtered_alpha);
+  }
+ End:
  free(quant_alpha);
  return ok;
 }

+
 //------------------------------------------------------------------------------
 // Main calls

@@ -408,3 +403,6 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
  return ok;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -19,6 +19,10 @@
 #include "./cost.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define MAX_ITERS_K_MEANS  6

 //------------------------------------------------------------------------------
@@ -51,7 +55,6 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
      for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
        if (cnt[n] >= majority_cnt_3_x_3_grid) {
          majority_seg = n;
-          break;
        }
      }
      tmp[x + y * w] = majority_seg;
@@ -150,8 +153,6 @@ static void AssignSegments(VP8Encoder* const enc,
  // 'int' type is ok for histo, and won't overflow
  int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];

-  assert(nb >= 1);
-
  // bracket the input
  for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
  min_a = n;
@@ -160,9 +161,8 @@ static void AssignSegments(VP8Encoder* const enc,
  range_a = max_a - min_a;

  // Spread initial centers evenly
-  for (k = 0, n = 1; k < nb; ++k, n += 2) {
-    assert(n < 2 * nb);
-    centers[k] = min_a + (n * range_a) / (2 * nb);
+  for (n = 1, k = 0; n < 2 * nb; n += 2) {
+    centers[k++] = min_a + (n * range_a) / (2 * nb);
  }

  for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@@ -177,7 +177,7 @@ static void AssignSegments(VP8Encoder* const enc,
    n = 0;    // track the nearest center for current 'a'
    for (a = min_a; a <= max_a; ++a) {
      if (alphas[a]) {
-        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
          n++;
        }
        map[a] = n;
@@ -384,114 +384,38 @@ static void ResetAllMBInfo(VP8Encoder* const enc) {
  // Default susceptibilities.
  enc->dqm_[0].alpha_ = 0;
  enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
-  enc->alpha_ = 0;
-  enc->uv_alpha_ = 0;
+  // Note: we can't compute this alpha_ / uv_alpha_.
  WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }

-// struct used to collect job result
-typedef struct {
-  WebPWorker worker;
-  int alphas[MAX_ALPHA + 1];
-  int alpha, uv_alpha;
-  VP8EncIterator it;
-  int delta_progress;
-} SegmentJob;
-
-// main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
-  int ok = 1;
-  if (!VP8IteratorIsDone(it)) {
-    uint8_t tmp[32 + ALIGN_CST];
-    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
-    do {
-      // Let's pretend we have perfect lossless reconstruction.
-      VP8IteratorImport(it, scratch);
-      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
-      ok = VP8IteratorProgress(it, job->delta_progress);
-    } while (ok && VP8IteratorNext(it));
-  }
-  return ok;
-}
-
-static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
-  int i;
-  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
-  dst->alpha += src->alpha;
-  dst->uv_alpha += src->uv_alpha;
-}
-
-// initialize the job struct with some TODOs
-static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
-                           int start_row, int end_row) {
-  WebPWorkerInit(&job->worker);
-  job->worker.data1 = job;
-  job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
-  VP8IteratorInit(enc, &job->it);
-  VP8IteratorSetRow(&job->it, start_row);
-  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
-  memset(job->alphas, 0, sizeof(job->alphas));
-  job->alpha = 0;
-  job->uv_alpha = 0;
-  // only one of both jobs can record the progress, since we don't
-  // expect the user's hook to be multi-thread safe
-  job->delta_progress = (start_row == 0) ? 20 : 0;
-}
-
-// main entry point
 int VP8EncAnalyze(VP8Encoder* const enc) {
  int ok = 1;
  const int do_segments =
      enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
      (enc->segment_hdr_.num_segments_ > 1) ||
      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+  enc->alpha_ = 0;
+  enc->uv_alpha_ = 0;
  if (do_segments) {
-    const int last_row = enc->mb_h_;
-    // We give a little more than a half work to the main thread.
-    const int split_row = (9 * last_row + 15) >> 4;
-    const int total_mb = last_row * enc->mb_w_;
-#ifdef WEBP_USE_THREAD
-    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
-    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
-#else
-    const int do_mt = 0;
-#endif
-    SegmentJob main_job;
-    if (do_mt) {
-      SegmentJob side_job;
-      // Note the use of '&' instead of '&&' because we must call the functions
-      // no matter what.
-      InitSegmentJob(enc, &main_job, 0, split_row);
-      InitSegmentJob(enc, &side_job, split_row, last_row);
-      // we don't need to call Reset() on main_job.worker, since we're calling
-      // WebPWorkerExecute() on it
-      ok &= WebPWorkerReset(&side_job.worker);
-      // launch the two jobs in parallel
-      if (ok) {
-        WebPWorkerLaunch(&side_job.worker);
-        WebPWorkerExecute(&main_job.worker);
-        ok &= WebPWorkerSync(&side_job.worker);
-        ok &= WebPWorkerSync(&main_job.worker);
-      }
-      WebPWorkerEnd(&side_job.worker);
-      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
-    } else {
-      // Even for single-thread case, we use the generic Worker tools.
-      InitSegmentJob(enc, &main_job, 0, last_row);
-      WebPWorkerExecute(&main_job.worker);
-      ok &= WebPWorkerSync(&main_job.worker);
-    }
-    WebPWorkerEnd(&main_job.worker);
-    if (ok) {
-      enc->alpha_ = main_job.alpha / total_mb;
-      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
-      AssignSegments(enc, main_job.alphas);
-    }
+    int alphas[MAX_ALPHA + 1] = { 0 };
+    VP8EncIterator it;
+
+    VP8IteratorInit(enc, &it);
+    do {
+      VP8IteratorImport(&it);
+      MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_);
+      ok = VP8IteratorProgress(&it, 20);
+      // Let's pretend we have perfect lossless reconstruction.
+    } while (ok && VP8IteratorNext(&it, it.yuv_in_));
+    enc->alpha_ /= enc->mb_w_ * enc->mb_h_;
+    enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
+    if (ok) AssignSegments(enc, alphas);
  } else {   // Use only one default segment.
    ResetAllMBInfo(enc);
  }
  return ok;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/backward_references.c
+++ b/src/enc/backward_references.c
@@ -156,14 +156,14 @@ static void GetParamsForHashChainFindCopy(int quality, int xsize,
  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
               : max_window_size;
  *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigorous search loop in HashChainFindCopy
+  // For lower entropy images, the rigourous search loop in HashChainFindCopy
  // can be relaxed.
  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
 }

 static int HashChainFindCopy(const HashChain* const p,
                             int base_position, int xsize_signed,
-                             const uint32_t* const argb, int max_len,
+                             const uint32_t* const argb, int maxlen,
                             int window_size, int iter_pos, int iter_limit,
                             int* const distance_ptr,
                             int* const length_ptr) {
@@ -176,34 +176,25 @@ static int HashChainFindCopy(const HashChain* const p,
      (base_position > window_size) ? base_position - window_size : 0;
  int pos;
  assert(xsize > 0);
-  if (max_len > MAX_LENGTH) {
-    max_len = MAX_LENGTH;
-  }
  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
       pos >= min_pos;
       pos = p->chain_[pos]) {
    uint64_t val;
    uint32_t curr_length;
    uint32_t distance;
-    const uint64_t* const ptr1 =
-        (const uint64_t*)(argb + pos + best_length - 1);
-    const uint64_t* const ptr2 =
-        (const uint64_t*)(argb_start + best_length - 1);
-
    if (iter_pos < 0) {
      if (iter_pos < iter_limit || best_val >= 0xff0000) {
        break;
      }
    }
    --iter_pos;
-
-    // Before 'expensive' linear match, check if the two arrays match at the
-    // current best length index and also for the succeeding elements.
-    if (*ptr1 != *ptr2) continue;
-
-    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
-    if (curr_length < best_length) continue;
-
+    if (argb[pos + best_length - 1] != argb_start[best_length - 1]) {
+      continue;
+    }
+    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
+    if (curr_length < best_length) {
+      continue;
+    }
    distance = (uint32_t)(base_position - pos);
    val = curr_length << 16;
    // Favoring 2d locality here gives savings for certain images.
@@ -222,7 +213,7 @@ static int HashChainFindCopy(const HashChain* const p,
      best_val = val;
      best_length = curr_length;
      best_distance = distance;
-      if (curr_length >= (uint32_t)max_len) {
+      if (curr_length >= MAX_LENGTH) {
        break;
      }
      if ((best_distance == 1 || distance == xsize) &&
@@ -300,8 +291,11 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
    int offset = 0;
    int len = 0;
    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int max_len = pix_count - i;
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+      int maxlen = pix_count - i;
+      if (maxlen > MAX_LENGTH) {
+        maxlen = MAX_LENGTH;
+      }
+      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
                        window_size, iter_pos, iter_limit,
                        &offset, &len);
    }
@@ -313,8 +307,11 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
      int k;
      HashChainInsert(hash_chain, &argb[i], i);
      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int max_len = pix_count - (i + 1);
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
+        int maxlen = pix_count - (i + 1);
+        if (maxlen > MAX_LENGTH) {
+          maxlen = MAX_LENGTH;
+        }
+        HashChainFindCopy(hash_chain, i + 1, xsize, argb, maxlen,
                          window_size, iter_pos, iter_limit,
                          &offset2, &len2);
        if (len2 > len + 1) {
@@ -324,10 +321,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
            refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
          } else {
-            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
          }
          ++refs->size;
+          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
          i++;  // Backward reference to be done for next pixel.
          len = len2;
          offset = offset2;
@@ -357,10 +354,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
        refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
      }
      ++refs->size;
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
      if (i + 1 < pix_count) {
        HashChainInsert(hash_chain, &argb[i], i);
      }
@@ -462,16 +459,16 @@ static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {

 static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                        uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
 }

 static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                          uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
+  int code, extra_bits_count, extra_bits_value;
+  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
+  return m->distance_[code] + extra_bits_count;
 }

 static int BackwardReferencesHashChainDistanceOnly(
@@ -525,8 +522,11 @@ static int BackwardReferencesHashChainDistanceOnly(
      int offset = 0;
      int len = 0;
      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int max_len = shortmax ? 2 : pix_count - i;
-        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+        int maxlen = shortmax ? 2 : MAX_LENGTH;
+        if (maxlen > pix_count - i) {
+          maxlen = pix_count - i;
+        }
+        HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
                          window_size, iter_pos, iter_limit,
                          &offset, &len);
      }
@@ -577,13 +577,13 @@ static int BackwardReferencesHashChainDistanceOnly(
        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
        cost_val += GetCacheCost(cost_model, ix) * mul0;
      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
      }
      if (cost[i] > cost_val) {
        cost[i] = (float)cost_val;
        dist_array[i] = 1;  // only one is inserted.
      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
    }
 next_symbol: ;
  }
@@ -650,12 +650,12 @@ static int BackwardReferencesHashChainFollowChosenPath(
  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
    int offset = 0;
    int len = 0;
-    int max_len = chosen_path[ix];
-    if (max_len != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
+    int maxlen = chosen_path[ix];
+    if (maxlen != 1) {
+      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
                        window_size, iter_pos, iter_limit,
                        &offset, &len);
-      assert(len == max_len);
+      assert(len == maxlen);
      refs->refs[size] = PixOrCopyCreateCopy(offset, len);
      if (use_color_cache) {
        for (k = 0; k < len; ++k) {
@@ -675,9 +675,9 @@ static int BackwardReferencesHashChainFollowChosenPath(
        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
        refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
        refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
      }
+      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
      if (i + 1 < pix_count) {
        HashChainInsert(hash_chain, &argb[i], i);
      }
@@ -780,8 +780,8 @@ int VP8LGetBackwardReferences(int width, int height,

  // Choose appropriate backward reference.
  if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
+    // TraceBackwards is costly. Don't execute it at lower quality (q <= 10).
+    const int try_lz77_trace_backwards = (quality > 10);
    *best = refs_lz77;   // default guess: lz77 is better
    VP8LClearBackwardRefs(&refs_rle);
    if (try_lz77_trace_backwards) {
--- a/src/enc/backward_references.h
+++ b/src/enc/backward_references.h
@@ -18,7 +18,7 @@
 #include "../webp/types.h"
 #include "../webp/format_constants.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -30,6 +30,73 @@ extern "C" {
 #define PIX_OR_COPY_CODES_MAX \
    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))

+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  assert(n != 0);
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  assert(n != 0);
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  assert(n != 0);
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return log_floor;
+  else
+    return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void PrefixEncode(int distance, int* const code,
+                                     int* const extra_bits_count,
+                                     int* const extra_bits_value) {
+  if (distance > 2) {  // Collect the two most significant bits.
+    const int highest_bit = BitsLog2Floor(--distance);
+    const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+    *extra_bits_count = highest_bit - 1;
+    *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
+    *code = 2 * highest_bit + second_highest_bit;
+  } else {
+    *extra_bits_count = 0;
+    *extra_bits_value = 0;
+    *code = (distance == 2) ? 1 : 0;
+  }
+}
+
 // -----------------------------------------------------------------------------
 // PixOrCopy

@@ -145,7 +212,7 @@ int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
                                      int xsize, int ysize,
                                      int* const best_cache_bits);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif

--- a/src/enc/config.c
+++ b/src/enc/config.c
@@ -13,6 +13,10 @@

 #include "../webp/encode.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // WebPConfig
 //------------------------------------------------------------------------------
@@ -29,7 +33,7 @@ int WebPConfigInitInternal(WebPConfig* config,
  config->target_PSNR = 0.;
  config->method = 4;
  config->sns_strength = 50;
-  config->filter_strength = 60;   // mid-filtering
+  config->filter_strength = 60;   // rather high filtering, helps w/ gradients.
  config->filter_sharpness = 0;
  config->filter_type = 1;        // default: strong (so U/V is filtered too)
  config->partitions = 0;
@@ -54,13 +58,11 @@ int WebPConfigInitInternal(WebPConfig* config,
      config->sns_strength = 80;
      config->filter_sharpness = 4;
      config->filter_strength = 35;
-      config->preprocessing &= ~2;   // no dithering
      break;
    case WEBP_PRESET_PHOTO:
      config->sns_strength = 80;
      config->filter_sharpness = 3;
      config->filter_strength = 30;
-      config->preprocessing |= 2;
      break;
    case WEBP_PRESET_DRAWING:
      config->sns_strength = 25;
@@ -70,12 +72,10 @@ int WebPConfigInitInternal(WebPConfig* config,
    case WEBP_PRESET_ICON:
      config->sns_strength = 0;
      config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
      break;
    case WEBP_PRESET_TEXT:
      config->sns_strength = 0;
      config->filter_strength = 0;   // disable filtering to retain sharpness
-      config->preprocessing &= ~2;   // no dithering
      config->segments = 2;
      break;
    case WEBP_PRESET_DEFAULT:
@@ -111,7 +111,7 @@ int WebPValidateConfig(const WebPConfig* config) {
    return 0;
  if (config->show_compressed < 0 || config->show_compressed > 1)
    return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 3)
+  if (config->preprocessing < 0 || config->preprocessing > 1)
    return 0;
  if (config->partitions < 0 || config->partitions > 3)
    return 0;
@@ -138,3 +138,6 @@ int WebPValidateConfig(const WebPConfig* config) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -13,6 +13,10 @@

 #include "./cost.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Boolean-cost cost table

@@ -383,107 +387,110 @@ const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
 // note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
-  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
-    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
-    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
-    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
-    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
-    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
-    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
-    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
-    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
-    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
-  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
-    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
-    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
-    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
-    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
-    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
-    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
-    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
-    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
-    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
-  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
-    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
-    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
-    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
-    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
-    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
-    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
-    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
-    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
-    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
-  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
-    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
-    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
-    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
-    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
-    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
-    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
-    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
-    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
-    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
-  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
-    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
-    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
-    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
-    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
-    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
-    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
-    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
-    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
-    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
-  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
-    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
-    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
-    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
-    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
-    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
-    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
-    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
-    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
-    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
-  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
-    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
-    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
-    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
-    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
-    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
-    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
-    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
-    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
-    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
-  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
-    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
-    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
-    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
-    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
-    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
-    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
-    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
-    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
-    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
-  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
-    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
-    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
-    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
-    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
-    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
-    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
-    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
-    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
-    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
-  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
-    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
-    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
-    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
-    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
-    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
-    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
-    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
-    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
-    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
+  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
+    {  403,  680, 1507, 1519, 2060, 2005, 1992, 1914, 1924, 1733 },
+    {  353, 1121,  973, 1895, 2060, 1787, 1671, 1516, 2012, 1868 },
+    {  770,  852, 1581,  632, 1393, 1780, 1823, 1936, 1074, 1218 },
+    {  510, 1270, 1467, 1319,  847, 1279, 1792, 2094, 1080, 1353 },
+    {  488, 1322,  918, 1573, 1300,  883, 1814, 1752, 1756, 1502 },
+    {  425,  992, 1820, 1514, 1843, 2440,  937, 1771, 1924, 1129 },
+    {  363, 1248, 1257, 1970, 2194, 2385, 1569,  953, 1951, 1601 },
+    {  723, 1257, 1631,  964,  963, 1508, 1697, 1824,  671, 1418 },
+    {  635, 1038, 1573,  930, 1673, 1413, 1410, 1687, 1410,  749 } },
+  { {  451,  613, 1345, 1702, 1870, 1716, 1728, 1766, 2190, 2310 },
+    {  678,  453, 1171, 1443, 1925, 1831, 2045, 1781, 1887, 1602 },
+    {  711,  666,  674, 1718, 1910, 1493, 1775, 1193, 2325, 2325 },
+    {  883,  854, 1583,  542, 1800, 1878, 1664, 2149, 1207, 1087 },
+    {  669,  994, 1248, 1122,  949, 1179, 1376, 1729, 1070, 1244 },
+    {  715, 1026,  715, 1350, 1430,  930, 1717, 1296, 1479, 1479 },
+    {  544,  841, 1656, 1450, 2094, 3883, 1010, 1759, 2076,  809 },
+    {  610,  855,  957, 1553, 2067, 1561, 1704,  824, 2066, 1226 },
+    {  833,  960, 1416,  819, 1277, 1619, 1501, 1617,  757, 1182 },
+    {  711,  964, 1252,  879, 1441, 1828, 1508, 1636, 1594,  734 } },
+  { {  605,  764,  734, 1713, 1747, 1192, 1819, 1353, 1877, 2392 },
+    {  866,  641,  586, 1622, 2072, 1431, 1888, 1346, 2189, 1764 },
+    {  901,  851,  456, 2165, 2281, 1405, 1739, 1193, 2183, 2443 },
+    {  770, 1045,  952, 1078, 1342, 1191, 1436, 1063, 1303,  995 },
+    {  901, 1086,  727, 1170,  884, 1105, 1267, 1401, 1739, 1337 },
+    {  951, 1162,  595, 1488, 1388,  703, 1790, 1366, 2057, 1724 },
+    {  534,  986, 1273, 1987, 3273, 1485, 1024, 1399, 1583,  866 },
+    {  699, 1182,  695, 1978, 1726, 1986, 1326,  714, 1750, 1672 },
+    {  951, 1217, 1209,  920, 1062, 1441, 1548,  999,  952,  932 },
+    {  733, 1284,  784, 1256, 1557, 1098, 1257, 1357, 1414,  908 } },
+  { {  316, 1075, 1653, 1220, 2145, 2051, 1730, 2131, 1884, 1790 },
+    {  745,  516, 1404,  894, 1599, 2375, 2013, 2105, 1475, 1381 },
+    {  516,  729, 1088, 1319, 1637, 3426, 1636, 1275, 1531, 1453 },
+    {  894,  943, 2138,  468, 1704, 2259, 2069, 1763, 1266, 1158 },
+    {  605, 1025, 1235,  871, 1170, 1767, 1493, 1500, 1104, 1258 },
+    {  739,  826, 1207, 1151, 1412,  846, 1305, 2726, 1014, 1569 },
+    {  558,  825, 1820, 1398, 3344, 1556, 1218, 1550, 1228,  878 },
+    {  429,  951, 1089, 1816, 3861, 3861, 1556,  969, 1568, 1828 },
+    {  883,  961, 1752,  769, 1468, 1810, 2081, 2346,  613, 1298 },
+    {  803,  895, 1372,  641, 1303, 1708, 1686, 1700, 1306, 1033 } },
+  { {  439, 1267, 1270, 1579,  963, 1193, 1723, 1729, 1198, 1993 },
+    {  705,  725, 1029, 1153, 1176, 1103, 1821, 1567, 1259, 1574 },
+    {  723,  859,  802, 1253,  972, 1202, 1407, 1665, 1520, 1674 },
+    {  894,  960, 1254,  887, 1052, 1607, 1344, 1349,  865, 1150 },
+    {  833, 1312, 1337, 1205,  572, 1288, 1414, 1529, 1088, 1430 },
+    {  842, 1279, 1068, 1861,  862,  688, 1861, 1630, 1039, 1381 },
+    {  766,  938, 1279, 1546, 3338, 1550, 1031, 1542, 1288,  640 },
+    {  715, 1090,  835, 1609, 1100, 1100, 1603, 1019, 1102, 1617 },
+    {  894, 1813, 1500, 1188,  789, 1194, 1491, 1919,  617, 1333 },
+    {  610, 1076, 1644, 1281, 1283,  975, 1179, 1688, 1434,  889 } },
+  { {  544,  971, 1146, 1849, 1221,  740, 1857, 1621, 1683, 2430 },
+    {  723,  705,  961, 1371, 1426,  821, 2081, 2079, 1839, 1380 },
+    {  783,  857,  703, 2145, 1419,  814, 1791, 1310, 1609, 2206 },
+    {  997, 1000, 1153,  792, 1229, 1162, 1810, 1418,  942,  979 },
+    {  901, 1226,  883, 1289,  793,  715, 1904, 1649, 1319, 3108 },
+    {  979, 1478,  782, 2216, 1454,  455, 3092, 1591, 1997, 1664 },
+    {  663, 1110, 1504, 1114, 1522, 3311,  676, 1522, 1530, 1024 },
+    {  605, 1138, 1153, 1314, 1569, 1315, 1157,  804, 1574, 1320 },
+    {  770, 1216, 1218, 1227,  869, 1384, 1232, 1375,  834, 1239 },
+    {  775, 1007,  843, 1216, 1225, 1074, 2527, 1479, 1149,  975 } },
+  { {  477,  817, 1309, 1439, 1708, 1454, 1159, 1241, 1945, 1672 },
+    {  577,  796, 1112, 1271, 1618, 1458, 1087, 1345, 1831, 1265 },
+    {  663,  776,  753, 1940, 1690, 1690, 1227, 1097, 3149, 1361 },
+    {  766, 1299, 1744, 1161, 1565, 1106, 1045, 1230, 1232,  707 },
+    {  915, 1026, 1404, 1182, 1184,  851, 1428, 2425, 1043,  789 },
+    {  883, 1456,  790, 1082, 1086,  985, 1083, 1484, 1238, 1160 },
+    {  507, 1345, 2261, 1995, 1847, 3636,  653, 1761, 2287,  933 },
+    {  553, 1193, 1470, 2057, 2059, 2059,  833,  779, 2058, 1263 },
+    {  766, 1275, 1515, 1039,  957, 1554, 1286, 1540, 1289,  705 },
+    {  499, 1378, 1496, 1385, 1850, 1850, 1044, 2465, 1515,  720 } },
+  { {  553,  930,  978, 2077, 1968, 1481, 1457,  761, 1957, 2362 },
+    {  694,  864,  905, 1720, 1670, 1621, 1429,  718, 2125, 1477 },
+    {  699,  968,  658, 3190, 2024, 1479, 1865,  750, 2060, 2320 },
+    {  733, 1308, 1296, 1062, 1576, 1322, 1062, 1112, 1172,  816 },
+    {  920,  927, 1052,  939,  947, 1156, 1152, 1073, 3056, 1268 },
+    {  723, 1534,  711, 1547, 1294,  892, 1553,  928, 1815, 1561 },
+    {  663, 1366, 1583, 2111, 1712, 3501,  522, 1155, 2130, 1133 },
+    {  614, 1731, 1188, 2343, 1944, 3733, 1287,  487, 3546, 1758 },
+    {  770, 1585, 1312,  826,  884, 2673, 1185, 1006, 1195, 1195 },
+    {  758, 1333, 1273, 1023, 1621, 1162, 1351,  833, 1479,  862 } },
+  { {  376, 1193, 1446, 1149, 1545, 1577, 1870, 1789, 1175, 1823 },
+    {  803,  633, 1136, 1058, 1350, 1323, 1598, 2247, 1072, 1252 },
+    {  614, 1048,  943,  981, 1152, 1869, 1461, 1020, 1618, 1618 },
+    { 1107, 1085, 1282,  592, 1779, 1933, 1648, 2403,  691, 1246 },
+    {  851, 1309, 1223, 1243,  895, 1593, 1792, 2317,  627, 1076 },
+    {  770, 1216, 1030, 1125,  921,  981, 1629, 1131, 1049, 1646 },
+    {  626, 1469, 1456, 1081, 1489, 3278,  981, 1232, 1498,  733 },
+    {  617, 1201,  812, 1220, 1476, 1476, 1478,  970, 1228, 1488 },
+    { 1179, 1393, 1540,  999, 1243, 1503, 1916, 1925,  414, 1614 },
+    {  943, 1088, 1490,  682, 1112, 1372, 1756, 1505,  966,  966 } },
+  { {  322, 1142, 1589, 1396, 2144, 1859, 1359, 1925, 2084, 1518 },
+    {  617,  625, 1241, 1234, 2121, 1615, 1524, 1858, 1720, 1004 },
+    {  553,  851,  786, 1299, 1452, 1560, 1372, 1561, 1967, 1713 },
+    {  770,  977, 1396,  568, 1893, 1639, 1540, 2108, 1430, 1013 },
+    {  684, 1120, 1375,  982,  930, 2719, 1638, 1643,  933,  993 },
+    {  553, 1103,  996, 1356, 1361, 1005, 1507, 1761, 1184, 1268 },
+    {  419, 1247, 1537, 1554, 1817, 3606, 1026, 1666, 1829,  923 },
+    {  439, 1139, 1101, 1257, 3710, 1922, 1205, 1040, 1931, 1529 },
+    {  979,  935, 1269,  847, 1202, 1286, 1530, 1535,  827, 1036 },
+    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
 };

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -16,7 +16,7 @@

 #include "./vp8enci.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -44,7 +44,7 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@@ -11,57 +11,12 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#include <assert.h>
 #include "./vp8enci.h"

-// This table gives, for a given sharpness, the filtering strength to be
-// used (at least) in order to filter a given edge step delta.
-// This is constructed by brute force inspection: for all delta, we iterate
-// over all possible filtering strength / thresh until needs_filter() returns
-// true.
-#define MAX_DELTA_SIZE 64
-static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
-  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
-    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
-    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
-  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
-    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
-    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
-    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
-    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
-    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
-    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
-    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
-    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
-    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
-    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
-    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
-    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
-  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
-    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
-    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
-};
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

-int VP8FilterStrengthFromDelta(int sharpness, int delta) {
-  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
-  assert(sharpness >= 0 && sharpness <= 7);
-  return kLevelsFromDelta[sharpness][pos];
-}
-
-// -----------------------------------------------------------------------------
 // NOTE: clip1, tables and InitTables are repeated entries of dsp.c
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
@@ -385,29 +340,28 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 // loop filter strength

 void VP8InitFilter(VP8EncIterator* const it) {
-  if (it->lf_stats_ != NULL) {
-    int s, i;
-    InitTables();
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      for (i = 0; i < MAX_LF_LEVELS; i++) {
-        (*it->lf_stats_)[s][i] = 0;
-      }
+  int s, i;
+  if (!it->lf_stats_) return;
+
+  InitTables();
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    for (i = 0; i < MAX_LF_LEVELS; i++) {
+      (*it->lf_stats_)[s][i] = 0;
    }
  }
 }

 void VP8StoreFilterStats(VP8EncIterator* const it) {
  int d;
-  VP8Encoder* const enc = it->enc_;
  const int s = it->mb_->segment_;
-  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = it->enc_->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]

  // explore +/-quant range of values around level0
-  const int delta_min = -enc->dqm_[s].quant_;
-  const int delta_max = enc->dqm_[s].quant_;
+  const int delta_min = -it->enc_->dqm_[s].quant_;
+  const int delta_max = it->enc_->dqm_[s].quant_;
  const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;

-  if (it->lf_stats_ == NULL) return;
+  if (!it->lf_stats_) return;

  // NOTE: Currently we are applying filter only across the sublock edges
  // There are two reasons for that.
@@ -431,41 +385,27 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
 }

 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
+  int s;
  VP8Encoder* const enc = it->enc_;
-  if (it->lf_stats_ != NULL) {
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      int i, best_level = 0;
-      // Improvement over filter level 0 should be at least 1e-5 (relatively)
-      double best_v = 1.00001 * (*it->lf_stats_)[s][0];
-      for (i = 1; i < MAX_LF_LEVELS; i++) {
-        const double v = (*it->lf_stats_)[s][i];
-        if (v > best_v) {
-          best_v = v;
-          best_level = i;
-        }
-      }
-      enc->dqm_[s].fstrength_ = best_level;
-    }
-  } else if (enc->config_->filter_strength > 0) {
-    int max_level = 0;
-    int s;
-    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-      VP8SegmentInfo* const dqm = &enc->dqm_[s];
-      // this '>> 3' accounts for some inverse WHT scaling
-      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
-      const int level =
-          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
-      if (level > dqm->fstrength_) {
-        dqm->fstrength_ = level;
-      }
-      if (max_level < dqm->fstrength_) {
-        max_level = dqm->fstrength_;
+
+  if (!it->lf_stats_) {
+    return;
+  }
+  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+    int i, best_level = 0;
+    // Improvement over filter level 0 should be at least 1e-5 (relatively)
+    double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+    for (i = 1; i < MAX_LF_LEVELS; i++) {
+      const double v = (*it->lf_stats_)[s][i];
+      if (v > best_v) {
+        best_v = v;
+        best_level = i;
      }
    }
-    enc->filter_hdr_.level_ = max_level;
+    enc->dqm_[s].fstrength_ = best_level;
  }
 }

-// -----------------------------------------------------------------------------
-
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -18,7 +18,10 @@

 #include "./vp8enci.h"
 #include "./cost.h"
-#include "../webp/format_constants.h"  // RIFF constants
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -36,63 +39,6 @@ typedef struct {
  CostArray*  cost;
 } VP8Residual;

-//------------------------------------------------------------------------------
-// multi-pass convergence
-
-#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
-                              VP8_FRAME_HEADER_SIZE)
-#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
-// we allow 2k of extra head-room in PARTITION0 limit.
-#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
-
-typedef struct {  // struct for organizing convergence in either size or PSNR
-  int is_first;
-  float dq;
-  float q, last_q;
-  double value, last_value;   // PSNR or size
-  double target;
-  int do_size_search;
-} PassStats;
-
-static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
-  const uint64_t target_size = (uint64_t)enc->config_->target_size;
-  const int do_size_search = (target_size != 0);
-  const float target_PSNR = enc->config_->target_PSNR;
-
-  s->is_first = 1;
-  s->dq = 10.f;
-  s->q = s->last_q = enc->config_->quality;
-  s->target = do_size_search ? (double)target_size
-            : (target_PSNR > 0.) ? target_PSNR
-            : 40.;   // default, just in case
-  s->value = s->last_value = 0.;
-  s->do_size_search = do_size_search;
-  return do_size_search;
-}
-
-static float Clamp(float v, float min, float max) {
-  return (v < min) ? min : (v > max) ? max : v;
-}
-
-static float ComputeNextQ(PassStats* const s) {
-  float dq;
-  if (s->is_first) {
-    dq = (s->value > s->target) ? -s->dq : s->dq;
-    s->is_first = 0;
-  } else if (s->value != s->last_value) {
-    const double slope = (s->target - s->value) / (s->last_value - s->value);
-    dq = (float)(slope * (s->last_q - s->q));
-  } else {
-    dq = 0.;  // we're done?!
-  }
-  // Limit variable to avoid large swings.
-  s->dq = Clamp(dq, -30.f, 30.f);
-  s->last_q = s->q;
-  s->last_value = s->value;
-  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
-  return s->q;
-}
-
 //------------------------------------------------------------------------------
 // Tables for level coding

@@ -346,20 +292,31 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }
-  cost = VP8BitCost(1, p0);
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
+  cost = 0;
+  while (n < res->last) {
+    int v = res->coeffs[n];
    const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
+    ++n;
+    if (v == 0) {
+      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
+      cost += t[0];
+      t = res->cost[b][0];
+      continue;
+    }
+    v = abs(v);
+    cost += VP8BitCost(1, p0);
    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-    // the masking trick is faster than "if (v) cost += ..." with clang
-    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
+    {
+      const int ctx = (v == 1) ? 1 : 2;
+      p0 = res->prob[b][ctx][0];
+      t = res->cost[b][ctx];
+    }
  }
  // Last coefficient is always non-zero
  {
    const int v = abs(res->coeffs[n]);
    assert(v != 0);
+    cost += VP8BitCost(1, p0);
    cost += VP8LevelCost(t, v);
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
@@ -728,83 +685,81 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }

-static double GetPSNR(uint64_t mse, uint64_t size) {
-  return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
-}
-
 //------------------------------------------------------------------------------
 //  StatLoop(): only collect statistics (number of skips, token usage, ...).
 //  This is used for deciding optimal probabilities. It also modifies the
-//  quantizer value if some target (size, PSNR) was specified.
+//  quantizer value if some target (size, PSNR) was specified.
+
+#define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better

 static void SetLoopParams(VP8Encoder* const enc, float q) {
  // Make sure the quality parameter is inside valid bounds
-  q = Clamp(q, 0.f, 100.f);
+  if (q < 0.) {
+    q = 0;
+  } else if (q > 100.) {
+    q = 100;
+  }

  VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
  SetSegmentProbas(enc);            // compute segment probabilities

  ResetStats(enc);
+  ResetTokenStats(enc);
+
  ResetSSE(enc);
 }

-static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
-                            int nb_mbs, int percent_delta,
-                            PassStats* const s) {
+static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt,
+                       int nb_mbs, float* const PSNR, int percent_delta) {
  VP8EncIterator it;
  uint64_t size = 0;
-  uint64_t size_p0 = 0;
  uint64_t distortion = 0;
  const uint64_t pixel_count = nb_mbs * 384;

+  SetLoopParams(enc, q);
+
  VP8IteratorInit(enc, &it);
-  SetLoopParams(enc, s->q);
  do {
    VP8ModeScore info;
-    VP8IteratorImport(&it, NULL);
+    VP8IteratorImport(&it);
    if (VP8Decimate(&it, &info, rd_opt)) {
      // Just record the number of skips and act like skip_proba is not used.
      enc->proba_.nb_skip_++;
    }
    RecordResiduals(&it, &info);
-    size += info.R + info.H;
-    size_p0 += info.H;
+    size += info.R;
    distortion += info.D;
    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
      return 0;
-    VP8IteratorSaveBoundary(&it);
-  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
+  size += FinalizeSkipProba(enc);
+  size += FinalizeTokenProbas(&enc->proba_);
+  size += enc->segment_hdr_.size_;
+  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;

-  size_p0 += enc->segment_hdr_.size_;
-  if (s->do_size_search) {
-    size += FinalizeSkipProba(enc);
-    size += FinalizeTokenProbas(&enc->proba_);
-    size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
-    s->value = (double)size;
-  } else {
-    s->value = GetPSNR(distortion, pixel_count);
+  if (PSNR) {
+    *PSNR = (float)(10.* log10(255. * 255. * pixel_count / distortion));
  }
-  return size_p0;
+  return (int)size;
 }

+// successive refinement increments.
+static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };
+
 static int StatLoop(VP8Encoder* const enc) {
  const int method = enc->method_;
  const int do_search = enc->do_search_;
  const int fast_probe = ((method == 0 || method == 3) && !do_search);
-  int num_pass_left = enc->config_->pass;
+  float q = enc->config_->quality;
+  const int max_passes = enc->config_->pass;
  const int task_percent = 20;
-  const int percent_per_pass =
-      (task_percent + num_pass_left / 2) / num_pass_left;
+  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
  const int final_percent = enc->percent_ + task_percent;
-  const VP8RDLevel rd_opt =
-      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
-  int nb_mbs = enc->mb_w_ * enc->mb_h_;
-  PassStats stats;
-
-  InitPassStats(enc, &stats);
-  ResetTokenStats(enc);
+  int pass;
+  int nb_mbs;

  // Fast mode: quick analysis pass over few mbs. Better than nothing.
+  nb_mbs = enc->mb_w_ * enc->mb_h_;
  if (fast_probe) {
    if (method == 3) {  // we need more stats for method 3 to be reliable.
      nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
@@ -813,35 +768,37 @@ static int StatLoop(VP8Encoder* const enc) {
    }
  }

-  while (num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    const uint64_t size_p0 =
-        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
-    if (size_p0 == 0) return 0;
-#if (DEBUG_SEARCH > 0)
-    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
-           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
+  // No target size: just do several passes without changing 'q'
+  if (!do_search) {
+    for (pass = 0; pass < max_passes; ++pass) {
+      const VP8RDLevel rd_opt = (method >= 3) ? RD_OPT_BASIC : RD_OPT_NONE;
+      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
+        return 0;
+      }
+    }
+  } else {
+    // binary search for a size close to target
+    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
+      float PSNR;
+      int criterion;
+      const int size = OneStatPass(enc, q, RD_OPT_BASIC, nb_mbs, &PSNR,
+                                   percent_per_pass);
+#if DEBUG_SEARCH
+      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
 #endif
-    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
+      if (size == 0) return 0;
+      if (enc->config_->target_PSNR > 0) {
+        criterion = (PSNR < enc->config_->target_PSNR);
+      } else {
+        criterion = (size < enc->config_->target_size);
+      }
+      // dichotomize
+      if (criterion) {
+        q += dqs[pass];
+      } else {
+        q -= dqs[pass];
+      }
    }
-    if (is_last_pass) {
-      break;
-    }
-    // If no target size: just do several pass without changing 'q'
-    if (do_search) {
-      ComputeNextQ(&stats);
-      if (fabs(stats.dq) <= DQ_LIMIT) break;
-    }
-  }
-  if (!do_search || !stats.do_size_search) {
-    // Need to finalize probas now, since it wasn't done during the search.
-    FinalizeSkipProba(enc);
-    FinalizeTokenProbas(&enc->proba_);
  }
  VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
  return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
@@ -878,7 +835,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
  }

  if (ok) {      // All good. Finish up.
-    if (enc->pic_->stats != NULL) {  // finalize byte counters...
+    if (enc->pic_->stats) {           // finalize byte counters...
      int i, s;
      for (i = 0; i <= 2; ++i) {
        for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
@@ -920,7 +877,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
    const int dont_use_skip = !enc->proba_.use_skip_proba_;
    const VP8RDLevel rd_opt = enc->rd_opt_level_;

-    VP8IteratorImport(&it, NULL);
+    VP8IteratorImport(&it);
    // Warning! order is important: first call VP8Decimate() and
    // *then* decide how to code the skip decision if there's one.
    if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
@@ -937,8 +894,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
    VP8StoreFilterStats(&it);
    VP8IteratorExport(&it);
    ok = VP8IteratorProgress(&it, 20);
-    VP8IteratorSaveBoundary(&it);
-  } while (ok && VP8IteratorNext(&it));
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));

  return PostLoopFinalize(&it, ok);
 }
@@ -948,110 +904,62 @@ int VP8EncLoop(VP8Encoder* const enc) {

 #if !defined(DISABLE_TOKEN_BUFFER)

-#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
+#define MIN_COUNT 96   // minimum number of macroblocks before updating stats

 int VP8EncTokenLoop(VP8Encoder* const enc) {
-  // Roughly refresh the proba eight times per pass
+  int ok;
+  // Roughly refresh the proba eight times per pass
  int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
-  int num_pass_left = enc->config_->pass;
-  const int do_search = enc->do_search_;
+  int cnt;
  VP8EncIterator it;
  VP8Proba* const proba = &enc->proba_;
  const VP8RDLevel rd_opt = enc->rd_opt_level_;
-  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
-  PassStats stats;
-  int ok;
-
-  InitPassStats(enc, &stats);
-  ok = PreLoopInitialize(enc);
-  if (!ok) return 0;

  if (max_count < MIN_COUNT) max_count = MIN_COUNT;
+  cnt = max_count;

  assert(enc->num_parts_ == 1);
  assert(enc->use_tokens_);
  assert(proba->use_skip_proba_ == 0);
  assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
-  assert(num_pass_left > 0);
+  assert(!enc->do_search_);         // TODO(skal): handle pass and dichotomy

-  while (ok && num_pass_left-- > 0) {
-    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
-                             (num_pass_left == 0) ||
-                             (enc->max_i4_header_bits_ == 0);
-    uint64_t size_p0 = 0;
-    uint64_t distortion = 0;
-    int cnt = max_count;
-    VP8IteratorInit(enc, &it);
-    SetLoopParams(enc, stats.q);
-    if (is_last_pass) {
-      ResetTokenStats(enc);
-      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
+  SetLoopParams(enc, enc->config_->quality);
+
+  ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
+
+  VP8IteratorInit(enc, &it);
+  VP8InitFilter(&it);
+  do {
+    VP8ModeScore info;
+    VP8IteratorImport(&it);
+    if (--cnt < 0) {
+      FinalizeTokenProbas(proba);
+      VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
+      cnt = max_count;
    }
-    VP8TBufferClear(&enc->tokens_);
-    do {
-      VP8ModeScore info;
-      VP8IteratorImport(&it, NULL);
-      if (--cnt < 0) {
-        FinalizeTokenProbas(proba);
-        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
-        cnt = max_count;
-      }
-      VP8Decimate(&it, &info, rd_opt);
-      RecordTokens(&it, &info, &enc->tokens_);
-      size_p0 += info.H;
-      distortion += info.D;
+    VP8Decimate(&it, &info, rd_opt);
+    RecordTokens(&it, &info, &enc->tokens_);
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (enc->use_layer_) {
-        VP8EncCodeLayerBlock(&it);
-      }
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
+    }
 #endif
-      if (is_last_pass) {
-        StoreSideInfo(&it);
-        VP8StoreFilterStats(&it);
-        VP8IteratorExport(&it);
-        ok = VP8IteratorProgress(&it, 20);
-      }
-      VP8IteratorSaveBoundary(&it);
-    } while (ok && VP8IteratorNext(&it));
-    if (!ok) break;
+    StoreSideInfo(&it);
+    VP8StoreFilterStats(&it);
+    VP8IteratorExport(&it);
+    ok = VP8IteratorProgress(&it, 20);
+  } while (ok && VP8IteratorNext(&it, it.yuv_out_));

-    size_p0 += enc->segment_hdr_.size_;
-    if (stats.do_size_search) {
-      uint64_t size = FinalizeTokenProbas(&enc->proba_);
-      size += VP8EstimateTokenSize(&enc->tokens_,
-                                   (const uint8_t*)proba->coeffs_);
-      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
-      size += HEADER_SIZE_ESTIMATE;
-      stats.value = (double)size;
-    } else {  // compute and store PSNR
-      stats.value = GetPSNR(distortion, pixel_count);
-    }
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);

-#if (DEBUG_SEARCH > 0)
-    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
-           num_pass_left, stats.last_value, stats.value,
-           stats.last_q, stats.q, stats.dq);
-#endif
-    if (size_p0 > PARTITION0_SIZE_LIMIT) {
-      ++num_pass_left;
-      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
-      continue;                        // ...and start over
-    }
-    if (is_last_pass) {
-      break;   // done
-    }
-    if (do_search) {
-      ComputeNextQ(&stats);  // Adjust q
-    }
-  }
  if (ok) {
-    if (!stats.do_size_search) {
-      FinalizeTokenProbas(&enc->proba_);
-    }
+    FinalizeTokenProbas(proba);
    ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                       (const uint8_t*)proba->coeffs_, 1);
  }
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+
  return PostLoopFinalize(&it, ok);
 }

@@ -1066,3 +974,6 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/histogram.c
+++ b/src/enc/histogram.c
@@ -90,10 +90,12 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
    int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
    ++histo->literal_[literal_ix];
  } else {
-    int code, extra_bits;
-    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
+    int code, extra_bits_count, extra_bits_value;
+    PrefixEncode(PixOrCopyLength(v),
+                 &code, &extra_bits_count, &extra_bits_value);
    ++histo->literal_[256 + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    PrefixEncode(PixOrCopyDistance(v),
+                 &code, &extra_bits_count, &extra_bits_value);
    ++histo->distance_[code];
  }
 }
--- a/src/enc/histogram.h
+++ b/src/enc/histogram.h
@@ -24,7 +24,7 @@
 #include "../webp/format_constants.h"
 #include "../webp/types.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -94,7 +94,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                             VP8LHistogramSet* const image_in,
                             uint16_t* const histogram_symbols);

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif

--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@@ -15,16 +15,21 @@

 #include "./vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // VP8Iterator
 //------------------------------------------------------------------------------

 static void InitLeft(VP8EncIterator* const it) {
-  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
+  const VP8Encoder* const enc = it->enc_;
+  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
      (it->y_ > 0) ? 129 : 127;
-  memset(it->y_left_, 129, 16);
-  memset(it->u_left_, 129, 8);
-  memset(it->v_left_, 129, 8);
+  memset(enc->y_left_, 129, 16);
+  memset(enc->u_left_, 129, 8);
+  memset(enc->v_left_, 129, 8);
  it->left_nz_[8] = 0;
 }

@@ -35,60 +40,43 @@ static void InitTop(VP8EncIterator* const it) {
  memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }

-void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
-  VP8Encoder* const enc = it->enc_;
-  it->x_ = 0;
-  it->y_ = y;
-  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
-  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
-  it->nz_ = enc->nz_;
-  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
-  it->y_top_ = enc->y_top_;
-  it->uv_top_ = enc->uv_top_;
-  InitLeft(it);
-}
-
 void VP8IteratorReset(VP8EncIterator* const it) {
  VP8Encoder* const enc = it->enc_;
-  VP8IteratorSetRow(it, 0);
-  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
+  it->x_ = 0;
+  it->y_ = 0;
+  it->y_offset_ = 0;
+  it->uv_offset_ = 0;
+  it->mb_ = enc->mb_info_;
+  it->preds_ = enc->preds_;
+  it->nz_ = enc->nz_;
+  it->bw_ = &enc->parts_[0];
+  it->done_ = enc->mb_w_* enc->mb_h_;
  InitTop(it);
  InitLeft(it);
  memset(it->bit_count_, 0, sizeof(it->bit_count_));
  it->do_trellis_ = 0;
 }

-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
-  it->count_down_ = it->count_down0_ = count_down;
-}
-
-int VP8IteratorIsDone(const VP8EncIterator* const it) {
-  return (it->count_down_ <= 0);
-}
-
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  it->enc_ = enc;
  it->y_stride_  = enc->pic_->y_stride;
  it->uv_stride_ = enc->pic_->uv_stride;
-  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
-  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
-  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
-  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
+  // TODO(later): for multithreading, these should be owned by 'it'.
+  it->yuv_in_   = enc->yuv_in_;
+  it->yuv_out_  = enc->yuv_out_;
+  it->yuv_out2_ = enc->yuv_out2_;
+  it->yuv_p_    = enc->yuv_p_;
  it->lf_stats_ = enc->lf_stats_;
  it->percent0_ = enc->percent_;
-  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
-  it->u_left_ = it->y_left_ + 16 + 16;
-  it->v_left_ = it->u_left_ + 16;
  VP8IteratorReset(it);
 }

 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
  VP8Encoder* const enc = it->enc_;
-  if (delta && enc->pic_->progress_hook != NULL) {
-    const int done = it->count_down0_ - it->count_down_;
-    const int percent = (it->count_down0_ <= 0)
+  if (delta && enc->pic_->progress_hook) {
+    const int percent = (enc->mb_h_ <= 1)
                      ? it->percent0_
-                      : it->percent0_ + delta * done / it->count_down0_;
+                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
    return WebPReportProgress(enc->pic_, percent, &enc->percent_);
  }
  return 1;
@@ -98,8 +86,6 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.

-static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
-
 static void ImportBlock(const uint8_t* src, int src_stride,
                        uint8_t* dst, int w, int h, int size) {
  int i;
@@ -117,55 +103,30 @@ static void ImportBlock(const uint8_t* src, int src_stride,
  }
 }

-static void ImportLine(const uint8_t* src, int src_stride,
-                       uint8_t* dst, int len, int total_len) {
-  int i;
-  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
-  for (; i < total_len; ++i) dst[i] = dst[len - 1];
-}
-
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
+void VP8IteratorImport(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const int x = it->x_, y = it->y_;
  const WebPPicture* const pic = enc->pic_;
-  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
  const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
  const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  const int w = MinSize(pic->width - x * 16, 16);
-  const int h = MinSize(pic->height - y * 16, 16);
-  const int uv_w = (w + 1) >> 1;
-  const int uv_h = (h + 1) >> 1;
+  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
+  uint8_t* const udst = it->yuv_in_ + U_OFF;
+  uint8_t* const vdst = it->yuv_in_ + V_OFF;
+  int w = (pic->width - x * 16);
+  int h = (pic->height - y * 16);

-  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
-  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
-  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
+  if (w > 16) w = 16;
+  if (h > 16) h = 16;

-  if (tmp_32 == NULL) return;
+  // Luma plane
+  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);

-  // Import source (uncompressed) samples into boundary.
-  if (x == 0) {
-    InitLeft(it);
-  } else {
-    if (y == 0) {
-      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
-    } else {
-      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
-      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
-      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
-    }
-    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
-    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
-    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
-  }
-
-  it->y_top_  = tmp_32 + 0;
-  it->uv_top_ = tmp_32 + 16;
-  if (y == 0) {
-    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
-  } else {
-    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
-    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
-    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
+  {   // U/V planes
+    const int uv_w = (w + 1) >> 1;
+    const int uv_h = (h + 1) >> 1;
+    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
+    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
  }
 }

@@ -281,44 +242,48 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 #undef BIT

 //------------------------------------------------------------------------------
-// Advance to the next position, doing the bookkeeping.
+// Advance to the next position, doing the bookkeeping.

-void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save) {
  VP8Encoder* const enc = it->enc_;
-  const int x = it->x_, y = it->y_;
-  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
-  if (x < enc->mb_w_ - 1) {   // left
-    int i;
-    for (i = 0; i < 16; ++i) {
-      it->y_left_[i] = ysrc[15 + i * BPS];
+  if (block_to_save) {
+    const int x = it->x_, y = it->y_;
+    const uint8_t* const ysrc = block_to_save + Y_OFF;
+    const uint8_t* const usrc = block_to_save + U_OFF;
+    if (x < enc->mb_w_ - 1) {   // left
+      int i;
+      for (i = 0; i < 16; ++i) {
+        enc->y_left_[i] = ysrc[15 + i * BPS];
+      }
+      for (i = 0; i < 8; ++i) {
+        enc->u_left_[i] = usrc[7 + i * BPS];
+        enc->v_left_[i] = usrc[15 + i * BPS];
+      }
+      // top-left (before 'top'!)
+      enc->y_left_[-1] = enc->y_top_[x * 16 + 15];
+      enc->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
+      enc->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
    }
-    for (i = 0; i < 8; ++i) {
-      it->u_left_[i] = uvsrc[7 + i * BPS];
-      it->v_left_[i] = uvsrc[15 + i * BPS];
+    if (y < enc->mb_h_ - 1) {  // top
+      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
+      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
    }
-    // top-left (before 'top'!)
-    it->y_left_[-1] = it->y_top_[15];
-    it->u_left_[-1] = it->uv_top_[0 + 7];
-    it->v_left_[-1] = it->uv_top_[8 + 7];
  }
-  if (y < enc->mb_h_ - 1) {  // top
-    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
-    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
-  }
-}

-int VP8IteratorNext(VP8EncIterator* const it) {
+  it->mb_++;
  it->preds_ += 4;
-  it->mb_ += 1;
-  it->nz_ += 1;
-  it->y_top_ += 16;
-  it->uv_top_ += 16;
-  it->x_ += 1;
-  if (it->x_ == it->enc_->mb_w_) {
-    VP8IteratorSetRow(it, ++it->y_);
+  it->nz_++;
+  it->x_++;
+  if (it->x_ == enc->mb_w_) {
+    it->x_ = 0;
+    it->y_++;
+    it->bw_ = &enc->parts_[it->y_ & (enc->num_parts_ - 1)];
+    it->preds_ = enc->preds_ + it->y_ * 4 * enc->preds_w_;
+    it->nz_ = enc->nz_;
+    InitLeft(it);
  }
-  return (0 < --it->count_down_);
+  return (0 < --it->done_);
 }

 //------------------------------------------------------------------------------
@@ -405,15 +370,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {

  // Import the boundary samples
  for (i = 0; i < 17; ++i) {    // left
-    it->i4_boundary_[i] = it->y_left_[15 - i];
+    it->i4_boundary_[i] = enc->y_left_[15 - i];
  }
  for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = it->y_top_[i];
+    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
  }
  // top-right samples have a special case on the far right of the picture
  if (it->x_ < enc->mb_w_ - 1) {
    for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = it->y_top_[i];
+      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
    }
  } else {    // else, replicate the last valid pixel four times
    for (i = 16; i < 16 + 4; ++i) {
@@ -454,3 +419,6 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/layer.c
+++ b/src/enc/layer.c
@@ -15,6 +15,10 @@

 #include "./vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------

 void VP8EncInitLayer(VP8Encoder* const enc) {
@@ -42,3 +46,6 @@ void VP8EncDeleteLayer(VP8Encoder* enc) {
  free(enc->layer_data_);
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@@ -16,15 +16,14 @@
 #include <math.h>

 #include "./vp8enci.h"
-#include "../utils/alpha_processing.h"
-#include "../utils/random.h"
 #include "../utils/rescaler.h"
 #include "../utils/utils.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"

-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

 #define HALVE(x) (((x) + 1) >> 1)
 #define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
@@ -35,10 +34,6 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)

-static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
-  return (0xff000000u | (r << 16) | (g << 8) | b);
-}
-
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
@@ -123,7 +118,6 @@ int WebPPictureAlloc(WebPPicture* picture) {
        picture->v0 = mem;
        mem += uv0_size;
      }
-      (void)mem;  // makes the static analyzer happy
    } else {
      void* memory;
      const uint64_t argb_size = (uint64_t)width * height;
@@ -401,28 +395,6 @@ static void RescalePlane(const uint8_t* src,
  }
 }

-static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
-  uint32_t* ptr = pic->argb;
-  int y;
-  for (y = 0; y < pic->height; ++y) {
-    WebPMultARGBRow(ptr, pic->width, inverse);
-    ptr += pic->argb_stride;
-  }
-}
-
-static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
-  const uint8_t* ptr_a = pic->a;
-  if (ptr_a != NULL) {
-    uint8_t* ptr_y = pic->y;
-    int y;
-    for (y = 0; y < pic->height; ++y) {
-      WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
-      ptr_y += pic->y_stride;
-      ptr_a += pic->a_stride;
-    }
-  }
-}
-
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
  WebPPicture tmp;
  int prev_width, prev_height;
@@ -453,19 +425,9 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
      WebPPictureFree(&tmp);
      return 0;
    }
-    // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }

-    // We take transparency into account on the luma plane only. That's not
-    // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
                 tmp.y, width, height, tmp.y_stride, work, 1);
-    AlphaMultiplyY(&tmp, 1);
-
    RescalePlane(pic->u,
                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
                 tmp.u,
@@ -475,6 +437,10 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                 tmp.v,
                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);

+    if (tmp.a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
    if (tmp.u0 != NULL) {
      const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
@@ -492,16 +458,12 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
      WebPPictureFree(&tmp);
      return 0;
    }
-    // In order to correctly interpolate colors, we need to apply the alpha
-    // weighting first (black-matting), scale the RGB values, and remove
-    // the premultiplication afterward (while preserving the alpha channel).
-    AlphaMultiplyARGB(pic, 0);
+
    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
                 pic->argb_stride * 4,
                 (uint8_t*)tmp.argb, width, height,
                 tmp.argb_stride * 4,
                 work, 4);
-    AlphaMultiplyARGB(&tmp, 1);
  }
  WebPPictureFree(pic);
  free(work);
@@ -590,101 +552,20 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion

-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
-
-static int kLinearToGammaTab[kGammaTabSize + 1];
-static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
-
-static void InitGammaTables(void) {
-  if (!kGammaTablesOk) {
-    int v;
-    const double scale = 1. / kGammaScale;
-    for (v = 0; v <= 255; ++v) {
-      kGammaToLinearTab[v] =
-          (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double x = scale * (v << kGammaTabFix);
-      kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
-    }
-    kGammaTablesOk = 1;
-  }
-}
-
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
-  return kGammaToLinearTab[v];
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  const int v = base_value << shift;              // final uplifted value
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
-  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
-  const int v0 = kLinearToGammaTab[tab_pos];
-  const int v1 = kLinearToGammaTab[tab_pos + 1];
-  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  return (y + kGammaTabRounder) >> kGammaTabFix;             // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  (void)shift;
-  return v;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-#define SUM4(ptr) LinearToGamma(                         \
-    GammaToLinear((ptr)[0]) +                            \
-    GammaToLinear((ptr)[step]) +                         \
-    GammaToLinear((ptr)[rgb_stride]) +                   \
-    GammaToLinear((ptr)[rgb_stride + step]), 0)          \
-
-#define SUM2H(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr)  \
-    LinearToGamma(GammaToLinear((ptr)[0]), 2)
-
+// TODO: we can do better than simply 2x2 averaging on U/V samples.
+#define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
+                   (ptr)[rgb_stride] + (ptr)[rgb_stride + step])
+#define SUM2H(ptr) (2 * (ptr)[0] + 2 * (ptr)[step])
+#define SUM2V(ptr) (2 * (ptr)[0] + 2 * (ptr)[rgb_stride])
+#define SUM1(ptr)  (4 * (ptr)[0])
 #define RGB_TO_UV(x, y, SUM) {                           \
  const int src = (2 * (step * (x) + (y) * rgb_stride)); \
  const int dst = (x) + (y) * picture->uv_stride;        \
  const int r = SUM(r_ptr + src);                        \
  const int g = SUM(g_ptr + src);                        \
  const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = RGBToU(r, g, b, &rg);                \
-  picture->v[dst] = RGBToV(r, g, b, &rg);                \
+  picture->u[dst] = VP8RGBToU(r, g, b);                  \
+  picture->v[dst] = VP8RGBToV(r, g, b);                  \
 }

 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -693,8 +574,8 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
  const int r = SUM(r_ptr + src);                        \
  const int g = SUM(g_ptr + src);                        \
  const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
-  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
+  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
+  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
 }

 static void MakeGray(WebPPicture* const picture) {
@@ -713,14 +594,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                              const uint8_t* const a_ptr,
                              int step,         // bytes per pixel
                              int rgb_stride,   // bytes per scanline
-                              float dithering,
                              WebPPicture* const picture) {
  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
  int x, y;
  const int width = picture->width;
  const int height = picture->height;
  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
-  VP8Random rg;

  picture->colorspace = uv_csp;
  picture->use_argb = 0;
@@ -729,15 +608,12 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
  }
  if (!WebPPictureAlloc(picture)) return 0;

-  VP8InitRandom(&rg, dithering);
-  InitGammaTables();
-
  // Import luma plane
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int offset = step * x + y * rgb_stride;
      picture->y[x + y * picture->y_stride] =
-          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
+          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
    }
  }

@@ -785,7 +661,6 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,

  if (has_alpha) {
    assert(step >= 4);
-    assert(picture->a != NULL);
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; ++x) {
        picture->a[x + y * picture->a_stride] =
@@ -808,7 +683,7 @@ static int Import(WebPPicture* const picture,

  if (!picture->use_argb) {
    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              0.f /* no dithering */, picture);
+                              picture);
  }
  if (import_alpha) {
    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
@@ -823,7 +698,10 @@ static int Import(WebPPicture* const picture,
      for (x = 0; x < width; ++x) {
        const int offset = step * x + y * rgb_stride;
        const uint32_t argb =
-            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+            0xff000000u |
+            (r_ptr[offset] << 16) |
+            (g_ptr[offset] <<  8) |
+            (b_ptr[offset]);
        picture->argb[x + y * picture->argb_stride] = argb;
      }
    }
@@ -884,7 +762,8 @@ int WebPPictureImportBGRX(WebPPicture* picture,

 int WebPPictureYUVAToARGB(WebPPicture* picture) {
  if (picture == NULL) return 0;
-  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+  if (picture->memory_ == NULL || picture->y == NULL ||
+      picture->u == NULL || picture->v == NULL) {
    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
  }
  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
@@ -907,7 +786,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);

    // First row, with replicated top samples.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
    cur_y += picture->y_stride;
    dst += argb_stride;
    // Center rows.
@@ -940,8 +819,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
  return 1;
 }

-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
-                                  float dithering) {
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
  if (picture == NULL) return 0;
  if (picture->argb == NULL) {
    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
@@ -957,8 +835,7 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
    PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
    tmp.use_argb = 0;
    tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
-                            &tmp)) {
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
    }
    // Copy back the YUV specs into 'picture'.
@@ -970,10 +847,6 @@ int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
  return 1;
 }

-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
-  return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
-}
-
 //------------------------------------------------------------------------------
 // Helper: clean up fully transparent area to help compressibility.

@@ -1039,91 +912,6 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2

-//------------------------------------------------------------------------------
-// Blend color and remove transparency info
-
-#define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
-#define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
-
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
-  const int red = (background_rgb >> 16) & 0xff;
-  const int green = (background_rgb >> 8) & 0xff;
-  const int blue = (background_rgb >> 0) & 0xff;
-  VP8Random rg;
-  int x, y;
-  if (pic == NULL) return;
-  VP8InitRandom(&rg, 0.f);
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
-    const int Y0 = RGBToY(red, green, blue, &rg);
-    // VP8RGBToU/V expects the u/v values summed over four pixels
-    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
-    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    if (!has_alpha || pic->a == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
-      // Luma blending
-      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
-      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = a_ptr[x];
-        if (alpha < 0xff) {
-          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
-        }
-      }
-      // Chroma blending every even line
-      if ((y & 1) == 0) {
-        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
-        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
-        uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
-        for (x = 0; x < uv_width; ++x) {
-          // Average four alpha values into a single blending weight.
-          // TODO(skal): might lead to visible contouring. Can we do better?
-          const int alpha =
-              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
-              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-        if (pic->width & 1) {   // rightmost pixel
-          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-      }
-      memset(a_ptr, 0xff, pic->width);
-    }
-  } else {
-    uint32_t* argb = pic->argb;
-    const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = (argb[x] >> 24) & 0xff;
-        if (alpha != 0xff) {
-          if (alpha > 0) {
-            int r = (argb[x] >> 16) & 0xff;
-            int g = (argb[x] >>  8) & 0xff;
-            int b = (argb[x] >>  0) & 0xff;
-            r = BLEND(red, r, alpha);
-            g = BLEND(green, g, alpha);
-            b = BLEND(blue, b, alpha);
-            argb[x] = MakeARGB32(r, g, b);
-          } else {
-            argb[x] = background;
-          }
-        }
-      }
-      argb += pic->argb_stride;
-    }
-  }
-}
-
-#undef BLEND
-#undef BLEND_10BIT
-
 //------------------------------------------------------------------------------
 // local-min distortion
 //
@@ -1300,10 +1088,10 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
  return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }

-ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
-ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
-ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR);
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA);
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);

 #undef ENCODE_FUNC

@@ -1313,12 +1101,15 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
  return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
 }

-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);

 #undef LOSSLESS_ENCODE_FUNC

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -13,7 +13,6 @@

 #include <assert.h>
 #include <math.h>
-#include <stdlib.h>  // for abs()

 #include "./vp8enci.h"
 #include "./cost.h"
@@ -25,78 +24,18 @@

 #define MID_ALPHA 64      // neutral value for susceptibility
 #define MIN_ALPHA 30      // lowest usable value for susceptibility
-#define MAX_ALPHA 100     // higher meaningful value for susceptibility
+#define MAX_ALPHA 100     // higher meaninful value for susceptibility

 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

 #define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision

-// number of non-zero coeffs below which we consider the block very flat
-// (and apply a penalty to complex predictions)
-#define FLATNESS_LIMIT_I16 10      // I16 mode
-#define FLATNESS_LIMIT_I4  3       // I4 mode
-#define FLATNESS_LIMIT_UV  2       // UV mode
-#define FLATNESS_PENALTY   140     // roughly ~1bit per block
-
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

-// #define DEBUG_BLOCK
-
-//------------------------------------------------------------------------------
-
-#if defined(DEBUG_BLOCK)
-
-#include <stdio.h>
-#include <stdlib.h>
-
-static void PrintBlockInfo(const VP8EncIterator* const it,
-                           const VP8ModeScore* const rd) {
-  int i, j;
-  const int is_i16 = (it->mb_->type_ == 1);
-  printf("SOURCE / OUTPUT / ABS DELTA\n");
-  for (j = 0; j < 24; ++j) {
-    if (j == 16) printf("\n");   // newline before the U/V block
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
-    printf("     ");
-    for (i = 0; i < 16; ++i) {
-      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
-    }
-    printf("\n");
-  }
-  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
-    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
-    (int)rd->score);
-  if (is_i16) {
-    printf("Mode: %d\n", rd->mode_i16);
-    printf("y_dc_levels:");
-    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
-    printf("\n");
-  } else {
-    printf("Modes[16]: ");
-    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
-    printf("\n");
-  }
-  printf("y_ac_levels:\n");
-  for (j = 0; j < 16; ++j) {
-    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
-      printf("%4d ", rd->y_ac_levels[j][i]);
-    }
-    printf("\n");
-  }
-  printf("\n");
-  printf("uv_levels (mode=%d):\n", rd->mode_uv);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 16; ++i) {
-      printf("%4d ", rd->uv_levels[j][i]);
-    }
-    printf("\n");
-  }
-}
-
-#endif   // DEBUG_BLOCK
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif

 //------------------------------------------------------------------------------

@@ -165,13 +104,31 @@ static const uint16_t kAcTable2[128] = {
  385, 393, 401, 409, 416, 424, 432, 440
 };

-static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
-  { 96, 110 }, { 96, 108 }, { 110, 115 }
+static const uint16_t kCoeffThresh[16] = {
+  0,  10, 20, 30,
+  10, 20, 30, 30,
+  20, 30, 30, 30,
+  30, 30, 30, 30
 };

-// Sharpening by (slightly) raising the hi-frequency coeffs.
+// TODO(skal): tune more. Coeff thresholding?
+static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 },
+  { 96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96,
+    96, 96, 96, 96 }
+};
+
+// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
 // Hack-ish but helpful for mid-bitrate range. Use with care.
-#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
 static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
@@ -184,30 +141,20 @@ static const uint8_t kFreqSharpening[16] = {

 // Returns the average quantizer
 static int ExpandMatrix(VP8Matrix* const m, int type) {
-  int i, sum;
-  for (i = 0; i < 2; ++i) {
-    const int is_ac_coeff = (i > 0);
-    const int bias = kBiasMatrices[type][is_ac_coeff];
-    m->iq_[i] = (1 << QFIX) / m->q_[i];
-    m->bias_[i] = BIAS(bias);
-    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
-    //   * zero if coeff <= zthresh
-    //   * non-zero if coeff > zthresh
-    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
-  }
+  int i;
+  int sum = 0;
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
-    m->iq_[i] = m->iq_[1];
-    m->bias_[i] = m->bias_[1];
-    m->zthresh_[i] = m->zthresh_[1];
  }
-  for (sum = 0, i = 0; i < 16; ++i) {
-    if (type == 0) {  // we only use sharpening for AC luma coeffs
-      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
-    } else {
-      m->sharpen_[i] = 0;
-    }
-    sum += m->q_[i];
+  for (i = 0; i < 16; ++i) {
+    const int j = kZigzag[i];
+    const int bias = kBiasMatrices[type][j];
+    m->iq_[j] = (1 << QFIX) / m->q_[j];
+    m->bias_[j] = BIAS(bias);
+    // TODO(skal): tune kCoeffThresh[]
+    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
+    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
+    sum += m->q_[j];
  }
  return (sum + 8) >> 4;
 }
@@ -235,17 +182,17 @@ static void SetupMatrices(VP8Encoder* enc) {
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

-    m->lambda_i4_          = (3 * q4 * q4) >> 7;
-    m->lambda_i16_         = (3 * q16 * q16);
-    m->lambda_uv_          = (3 * quv * quv) >> 6;
-    m->lambda_mode_        = (1 * q4 * q4) >> 7;
-    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
-    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
-    m->lambda_trellis_uv_  = (quv *quv) << 1;
-    m->tlambda_            = (tlambda_scale * q4) >> 5;
-
-    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
-    m->max_edge_  = 0;
+    // TODO: Switch to kLambda*[] tables?
+    {
+      m->lambda_i4_  = (3 * q4 * q4) >> 7;
+      m->lambda_i16_ = (3 * q16 * q16);
+      m->lambda_uv_  = (3 * quv * quv) >> 6;
+      m->lambda_mode_    = (1 * q4 * q4) >> 7;
+      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
+      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
+      m->lambda_trellis_uv_  = (quv *quv) << 1;
+      m->tlambda_            = (tlambda_scale * q4) >> 5;
+    }
  }
 }

@@ -254,21 +201,16 @@ static void SetupMatrices(VP8Encoder* enc) {

 // Very small filter-strength values have close to no visual effect. So we can
 // save a little decoding-CPU by turning filtering off for these.
-#define FSTRENGTH_CUTOFF 2
+#define FSTRENGTH_CUTOFF 3

 static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
-  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
-  const int level0 = 5 * enc->config_->filter_strength;
+  const int level0 = enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
-    VP8SegmentInfo* const m = &enc->dqm_[i];
-    // We focus on the quantization of AC coeffs.
-    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
-    const int base_strength =
-        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
-    // Segments with lower complexity ('beta') will be less filtered.
-    const int f = base_strength * level0 / (256 + m->beta_);
-    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
+    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
+    const int f = level / (256 + enc->dqm_[i].beta_);
+    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
@@ -292,7 +234,7 @@ static double QualityToCompression(double c) {
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
-  // low quant. Finer modeling for high-quant would make use of kAcTable[]
+  // low quant. Finer modelling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
@@ -425,14 +367,16 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };

 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
 }

 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
+  const VP8Encoder* const enc = it->enc_;
+  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
 }

@@ -488,7 +432,6 @@ static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
-  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
 }
@@ -497,7 +440,6 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
-  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
 }
@@ -506,7 +448,6 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
-  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
 }
@@ -535,7 +476,7 @@ typedef struct {

 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
-  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
+  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
 }

 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
@@ -598,10 +539,11 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
-    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    int level0 = QUANTDIV(coeff0, iQ, B);
-    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
+    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0;
+    if (coeff0 > 2047) coeff0 = 2047;

+    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
@@ -613,7 +555,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
-      if (level > MAX_LEVEL || level < 0) {   // node is dead?
+      if (level >= 2048 || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
@@ -706,10 +648,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];
@@ -718,7 +660,7 @@ static int ReconstructIntra16(VP8EncIterator* const it,
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
-  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
+  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
@@ -813,18 +755,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,

 //------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
-// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
-
-static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
-  // We look at the first three AC coefficients to determine what is the average
-  // delta between each sub-4x4 block.
-  const int v0 = abs(DCs[1]);
-  const int v1 = abs(DCs[4]);
-  const int v2 = abs(DCs[5]);
-  int max_v = (v0 > v1) ? v1 : v0;
-  max_v = (v2 > max_v) ? v2 : max_v;
-  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
-}
+// Pick the mode is lower RD-cost = Rate + lamba * Distortion.

 static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
@@ -836,23 +767,9 @@ static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }

-static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
-  score_t score = 0;
-  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
-    int i;
-    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
-      score += (levels[i] != 0);
-      if (score > thresh) return 0;
-    }
-    levels += 16;
-  }
-  return 1;
-}
-
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 16;
-  VP8Encoder* const enc = it->enc_;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8Encoder* const enc = it->enc_;
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -871,13 +788,8 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
-    rd16.H = VP8FixedCostsI16[mode];
    rd16.R = VP8GetCostLuma16(it, &rd16);
-    if (mode > 0 &&
-        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
-      // penalty to avoid flat area to be mispredicted by complex mode
-      rd16.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd16.R += VP8FixedCostsI16[mode];

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
@@ -892,13 +804,6 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);
-
-  // we have a blocky macroblock (only DCs are non-zero) with fairly high
-  // distortion, record max delta so we can later adjust the minimal filtering
-  // strength needed to smooth these blocks out.
-  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
-    StoreMaxDelta(dqm, rd->y_dc_levels);
-  }
 }

 //------------------------------------------------------------------------------
@@ -928,11 +833,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  }

  InitScore(&rd_best);
-  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
-  SetRDScore(dqm->lambda_mode_, &rd_best);
+  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
-    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
@@ -956,11 +859,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
-      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
-      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
-        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
-      }
+      rd_tmp.R += mode_costs[mode];

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
@@ -972,17 +872,14 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
-    if (rd_best.score >= rd->score) {
-      return 0;
-    }
-    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
-    if (total_header_bits > enc->max_i4_header_bits_) {
+    total_header_bits += mode_costs[best_mode];
+    if (rd_best.score >= rd->score ||
+        total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
-    if (best_block != best_blocks + VP8Scan[it->i4_]) {
+    if (best_block != best_blocks + VP8Scan[it->i4_])
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
-    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));
@@ -998,7 +895,6 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------

 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const int kNumBlocks = 8;
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
@@ -1019,11 +915,8 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
-    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
-    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
-      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
-    }
+    rd_uv.R += VP8FixedCostsUV[mode];

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
@@ -1154,3 +1047,6 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
  return is_skipped;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/syntax.c
+++ b/src/enc/syntax.c
@@ -18,6 +18,10 @@
 #include "../webp/mux_types.h"         // ALPHA_FLAG
 #include "./vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Helper functions

@@ -421,3 +425,6 @@ int VP8EncWrite(VP8Encoder* const enc) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/token.c
+++ b/src/enc/token.c
@@ -20,9 +20,12 @@
 #include <stdlib.h>
 #include <string.h>

-#include "./cost.h"
 #include "./vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #if !defined(DISABLE_TOKEN_BUFFER)

 // we use pages to reduce the number of memcpy()
@@ -235,29 +238,6 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
  return 1;
 }

-// Size estimation
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
-  size_t size = 0;
-  const VP8Tokens* p = b->pages_;
-  if (b->error_) return 0;
-  while (p != NULL) {
-    const VP8Tokens* const next = p->next_;
-    const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
-      const int bit = token & (1 << 15);
-      if (token & FIXED_PROBA_BIT) {
-        size += VP8BitCost(bit, token & 0xffu);
-      } else {
-        size += VP8BitCost(bit, probas[token & 0x3fffu]);
-      }
-    }
-    p = next;
-  }
-  return size;
-}
-
 //------------------------------------------------------------------------------

 #else     // DISABLE_TOKEN_BUFFER
@@ -271,3 +251,6 @@ void VP8TBufferClear(VP8TBuffer* const b) {

 #endif    // !DISABLE_TOKEN_BUFFER

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/tree.c
+++ b/src/enc/tree.c
@@ -7,18 +7,23 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Coding of token probabilities, intra modes and segments.
+// Token probabilities
 //
 // Author: Skal (pascal.massimino@gmail.com)

 #include "./vp8enci.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Default probabilities

 // Paragraph 13.5
 const uint8_t
  VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // generated using vp8_default_coef_probs() in entropy.c:129
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -315,7 +320,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
  VP8EncIterator it;
  VP8IteratorInit(enc, &it);
  do {
-    const VP8MBInfo* const mb = it.mb_;
+    const VP8MBInfo* mb = it.mb_;
    const uint8_t* preds = it.preds_;
    if (enc->segment_hdr_.update_map_) {
      PutSegment(bw, mb->segment_, enc->proba_.segments_);
@@ -340,7 +345,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
      }
    }
    PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it));
+  } while (VP8IteratorNext(&it, 0));
 }

 //------------------------------------------------------------------------------
@@ -502,3 +507,6 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
  }
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -20,7 +20,7 @@
 #include "../utils/bit_writer.h"
 #include "../utils/thread.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -29,8 +29,8 @@ extern "C" {

 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 0
+#define ENC_MIN_VERSION 3
+#define ENC_REV_VERSION 1

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -74,7 +74,7 @@ typedef enum {   // Rate-distortion optimization levels
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
+//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@@ -248,19 +248,16 @@ typedef struct {
  int beta_;       // filter-susceptibility, range [0,255].
  int quant_;      // final segment quantizer.
  int fstrength_;  // final in-loop filtering strength
-  int max_edge_;   // max edge delta (for filtering strength)
-  int min_disto_;  // minimum distortion required to trigger filtering record
  // reactivities
  int lambda_i16_, lambda_i4_, lambda_uv_;
  int lambda_mode_, lambda_trellis_, tlambda_;
  int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
 } VP8SegmentInfo;

-// Handy transient struct to accumulate score and info during RD-optimization
+// Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
-  score_t D, SD;              // Distortion, spectral distortion
-  score_t H, R, score;        // header bits, rate, score.
+  score_t D, SD, R, score;    // Distortion, spectral distortion, rate, score.
  int16_t y_dc_levels[16];    // Quantized levels for luma-DC, luma-AC, chroma.
  int16_t y_ac_levels[16][16];
  int16_t uv_levels[4 + 4][16];
@@ -274,11 +271,12 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
  int x_, y_;                      // current macroblock
+  int y_offset_, uv_offset_;       // offset to the luma / chroma planes
  int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // input samples
-  uint8_t*      yuv_out_;          // output samples
-  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
-  uint8_t*      yuv_p_;            // scratch buffer for prediction
+  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
+  uint8_t*      yuv_out_;          // ''
+  uint8_t*      yuv_out2_;         // ''
+  uint8_t*      yuv_p_;            // ''
  VP8Encoder*   enc_;              // back-pointer
  VP8MBInfo*    mb_;               // current macroblock
  VP8BitWriter* bw_;               // current bit-writer
@@ -294,43 +292,24 @@ typedef struct {
  uint64_t      uv_bits_;          // macroblock bit-cost for chroma
  LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
  int           do_trellis_;       // if true, perform extra level optimisation
-  int           count_down_;       // number of mb still to be processed
-  int           count_down0_;      // starting counter value (for progress)
+  int           done_;             // true when scan is finished
  int           percent0_;         // saved initial progress percent
-
-  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
-  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
-  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
-
-  uint8_t* y_top_;     // top luma samples at position 'x_'
-  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
-
-  // memory for storing y/u/v_left_ and yuv_in_/out_*
-  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
-  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;

  // in iterator.c
-// must be called first
+// must be called first.
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan
+// restart a scan.
 void VP8IteratorReset(VP8EncIterator* const it);
-// reset iterator position to row 'y'
-void VP8IteratorSetRow(VP8EncIterator* const it, int y);
-// set count down (=number of iterations to go)
-void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
-// return true if iteration is finished
-int VP8IteratorIsDone(const VP8EncIterator* const it);
-// Import uncompressed samples from source.
-// If tmp_32 is not NULL, import boundary samples too.
-// tmp_32 is a 32-bytes scratch buffer that must be aligned in memory.
-void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
+// import samples from source
+void VP8IteratorImport(const VP8EncIterator* const it);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns false if not finished.
-int VP8IteratorNext(VP8EncIterator* const it);
-// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
-void VP8IteratorSaveBoundary(VP8EncIterator* const it);
+// go to next macroblock. Returns !done_. If 'block_to_save' is non-NULL, will
+// save the boundary values to top_/left_ arrays. block_to_save can be
+// it->yuv_out_ or it->yuv_in_.
+int VP8IteratorNext(VP8EncIterator* const it,
+                    const uint8_t* const block_to_save);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                        int final_delta_percent);
@@ -381,9 +360,6 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
                         const int16_t* const coeffs,
                         VP8TBuffer* const tokens);

-// Estimate the final coded size given a set of 'probas'.
-size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
-
 // unused for now
 void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);

@@ -459,9 +435,17 @@ struct VP8Encoder {
  VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
  uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
  uint32_t*  nz_;        // non-zero bit context: mb_w+1
+  uint8_t*   yuv_in_;    // input samples
+  uint8_t*   yuv_out_;   // output samples
+  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
+  uint8_t*   yuv_p_;     // scratch buffer for prediction
  uint8_t   *y_top_;     // top luma samples.
  uint8_t   *uv_top_;    // top u/v samples.
-                         // U and V are packed into 16 bytes (8 U + 8 V)
+                         // U and V are packed into 16 pixels (8 U + 8 V)
+  uint8_t   *y_left_;    // left luma samples (addressable from index -1 to 15).
+  uint8_t   *u_left_;    // left u samples (addressable from index -1 to 7)
+  uint8_t   *v_left_;    // left v samples (addressable from index -1 to 7)
+
  LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };

@@ -557,13 +541,9 @@ void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
 void VP8AdjustFilterStrength(VP8EncIterator* const it);

-// returns the approximate filtering strength needed to smooth a edge
-// step of 'delta', given a sharpness parameter 'sharpness'.
-int VP8FilterStrengthFromDelta(int sharpness, int delta);
-
 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/enc/vp8l.c
+++ b/src/enc/vp8l.c
@@ -25,6 +25,10 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
 #define MAX_COLORS_FOR_GRAPH      64
@@ -164,6 +168,9 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
      }
      if (pred_entropy < 0.95 * non_pred_entropy) {
        enc->use_predict_ = 1;
+        // TODO(vikasa): Observed some correlation of cross_color transform with
+        // predict. Need to investigate this further and add separate heuristic
+        // for setting use_cross_color flag.
        enc->use_cross_color_ = 1;
      }
    }
@@ -444,12 +451,12 @@ static void StoreImageToBitMask(
      int bits, n_bits;
      int code, distance;

-      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
+      PrefixEncode(v->len, &code, &n_bits, &bits);
      WriteHuffmanCode(bw, codes, 256 + code);
      VP8LWriteBits(bw, n_bits, bits);

      distance = PixOrCopyDistance(v);
-      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
+      PrefixEncode(distance, &code, &n_bits, &bits);
      WriteHuffmanCode(bw, codes + 4, code);
      VP8LWriteBits(bw, n_bits, bits);
    }
@@ -695,7 +702,7 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
  const int ccolor_transform_bits = enc->transform_bits_;
  const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
  const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
+  const int step = (quality == 0) ? 32 : 8;

  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
                          enc->argb_, enc->transform_data_);
@@ -820,7 +827,7 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
  }

  if (use_LUT) {
-    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
+    int inv_palette[MAX_PALETTE_SIZE] = { 0 };
    for (i = 0; i < palette_size; ++i) {
      const int color = (palette[i] >> 8) & 0xff;
      inv_palette[color] = i;
@@ -888,7 +895,7 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
  if (err != VP8_ENC_OK) goto Error;
  dst = enc->argb_;

-  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
+  row = WebPSafeMalloc((uint64_t)width, sizeof(*row));
  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;

  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
@@ -952,9 +959,6 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
  }
  enc->config_ = config;
  enc->pic_ = picture;
-
-  VP8LDspInit();
-
  return enc;
 }

@@ -1166,3 +1170,6 @@ int VP8LEncodeImage(const WebPConfig* const config,

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/vp8li.h
+++ b/src/enc/vp8li.h
@@ -19,7 +19,7 @@
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -63,7 +63,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -22,6 +22,10 @@

 // #define PRINT_MEMORY_INFO

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #ifdef PRINT_MEMORY_INFO
 #include <stdio.h>
 #endif
@@ -132,7 +136,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
  enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
  if (!config->low_memory) {
 #if !defined(DISABLE_TOKEN_BUFFER)
-    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
+    enc->use_tokens_ = (method >= 3) && !enc->do_search_;
 #endif
    if (enc->use_tokens_) {
      enc->num_parts_ = 1;   // doesn't work with multi-partition
@@ -153,7 +157,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //             non-zero: 196
 //             lf-stats: 2048
 //                total: 68635
-// Transient object sizes:
+// Transcient object sizes:
 //       VP8EncIterator: 352
 //         VP8ModeScore: 912
 //       VP8SegmentInfo: 532
@@ -171,16 +175,20 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  const int preds_h = 4 * mb_h + 1;
  const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
  const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
+  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t);
+  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
  const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
-                            + ALIGN_CST;                        // align all
+  const size_t samples_size = (2 * top_stride +         // top-luma/u/v
+                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
+                               2 * ALIGN_CST)           // align all
+                               * sizeof(uint8_t);
  const size_t lf_stats_size =
      config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
  VP8Encoder* enc;
  uint8_t* mem;
  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
                      + ALIGN_CST                      // cache alignment
+                      + cache_size                     // working caches
                      + info_size                      // modes info
                      + preds_size                     // prediction modes
                      + samples_size                   // top/left samples
@@ -191,15 +199,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  printf("===================================\n");
  printf("Memory used:\n"
         "             encoder: %ld\n"
+         "         block cache: %ld\n"
         "                info: %ld\n"
         "               preds: %ld\n"
         "         top samples: %ld\n"
         "            non-zero: %ld\n"
         "            lf-stats: %ld\n"
         "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
         preds_size, samples_size, nz_size, lf_stats_size, size);
-  printf("Transient object sizes:\n"
+  printf("Transcient object sizes:\n"
         "      VP8EncIterator: %ld\n"
         "        VP8ModeScore: %ld\n"
         "      VP8SegmentInfo: %ld\n"
@@ -224,11 +233,19 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->mb_w_ = mb_w;
  enc->mb_h_ = mb_h;
  enc->preds_w_ = preds_w;
+  enc->yuv_in_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_out2_ = (uint8_t*)mem;
+  mem += YUV_SIZE;
+  enc->yuv_p_ = (uint8_t*)mem;
+  mem += PRED_SIZE;
  enc->mb_info_ = (VP8MBInfo*)mem;
  mem += info_size;
  enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
  mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
+  enc->nz_ = 1 + (uint32_t*)mem;
  mem += nz_size;
  enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
  mem += lf_stats_size;
@@ -238,7 +255,13 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
  enc->y_top_ = (uint8_t*)mem;
  enc->uv_top_ = enc->y_top_ + top_stride;
  mem += 2 * top_stride;
-  assert(mem <= (uint8_t*)enc + size);
+  mem = (uint8_t*)DO_ALIGN(mem + 1);
+  enc->y_left_ = (uint8_t*)mem;
+  mem += 16 + 16;
+  enc->u_left_ = (uint8_t*)mem;
+  mem += 16;
+  enc->v_left_ = (uint8_t*)mem;
+  mem += 8;

  enc->config_ = config;
  enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
@@ -277,7 +300,7 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
 //------------------------------------------------------------------------------

 static double GetPSNR(uint64_t err, uint64_t size) {
-  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
+  return err ? 10. * log10(255. * 255. * size / err) : 99.;
 }

 static void FinalizePSNR(const VP8Encoder* const enc) {
@@ -354,17 +377,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
    VP8Encoder* enc = NULL;
    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
      // Make sure we have YUVA samples.
-      float dithering = 0.f;
-      if (config->preprocessing & 2) {
-        const float x = config->quality / 100.f;
-        const float x2 = x * x;
-        // slowly decreasing from max dithering at low quality (q->0)
-        // to 0.5 dithering amplitude at high quality (q->100)
-        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
-      }
-      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
-        return 0;
-      }
+      if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
    }

    enc = InitVP8Encoder(config, pic);
@@ -402,3 +415,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
  return ok;
 }

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/mux/Makefile.am
+++ b/src/mux/Makefile.am
@@ -13,6 +13,6 @@ libwebpmuxinclude_HEADERS += ../webp/mux_types.h
 libwebpmuxinclude_HEADERS += ../webp/types.h

 libwebpmux_la_LIBADD = ../libwebp.la
-libwebpmux_la_LDFLAGS = -no-undefined -version-info 1:0:0
+libwebpmux_la_LDFLAGS = -no-undefined -version-info 0:1:0
 libwebpmuxincludedir = $(includedir)/webp
 pkgconfig_DATA = libwebpmux.pc
--- a/src/mux/muxedit.c
+++ b/src/mux/muxedit.c
@@ -16,6 +16,10 @@
 #include "./muxi.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Life of a mux object.

@@ -35,22 +39,21 @@ WebPMux* WebPNewInternal(int version) {
  }
 }

-// Delete all images in 'wpi_list'.
-static void DeleteAllImages(WebPMuxImage** const wpi_list) {
-  while (*wpi_list != NULL) {
-    *wpi_list = MuxImageDelete(*wpi_list);
+static void DeleteAllChunks(WebPChunk** const chunk_list) {
+  while (*chunk_list) {
+    *chunk_list = ChunkDelete(*chunk_list);
  }
 }

 static void MuxRelease(WebPMux* const mux) {
  if (mux == NULL) return;
-  DeleteAllImages(&mux->images_);
-  ChunkListDelete(&mux->vp8x_);
-  ChunkListDelete(&mux->iccp_);
-  ChunkListDelete(&mux->anim_);
-  ChunkListDelete(&mux->exif_);
-  ChunkListDelete(&mux->xmp_);
-  ChunkListDelete(&mux->unknown_);
+  MuxImageDeleteAll(&mux->images_);
+  DeleteAllChunks(&mux->vp8x_);
+  DeleteAllChunks(&mux->iccp_);
+  DeleteAllChunks(&mux->anim_);
+  DeleteAllChunks(&mux->exif_);
+  DeleteAllChunks(&mux->xmp_);
+  DeleteAllChunks(&mux->unknown_);
 }

 void WebPMuxDelete(WebPMux* mux) {
@@ -65,56 +68,79 @@ void WebPMuxDelete(WebPMux* mux) {
 // Handy MACRO, makes MuxSet() very symmetric to MuxGet().
 #define SWITCH_ID_LIST(INDEX, LIST)                                            \
  if (idx == (INDEX)) {                                                        \
-    err = ChunkAssignData(&chunk, data, copy_data, tag);                       \
+    err = ChunkAssignData(&chunk, data, copy_data, kChunks[(INDEX)].tag);      \
    if (err == WEBP_MUX_OK) {                                                  \
      err = ChunkSetNth(&chunk, (LIST), nth);                                  \
    }                                                                          \
    return err;                                                                \
  }

-static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth,
+static WebPMuxError MuxSet(WebPMux* const mux, CHUNK_INDEX idx, uint32_t nth,
                           const WebPData* const data, int copy_data) {
  WebPChunk chunk;
  WebPMuxError err = WEBP_MUX_NOT_FOUND;
-  const CHUNK_INDEX idx = ChunkGetIndexFromTag(tag);
  assert(mux != NULL);
  assert(!IsWPI(kChunks[idx].id));

  ChunkInit(&chunk);
-  SWITCH_ID_LIST(IDX_VP8X,    &mux->vp8x_);
-  SWITCH_ID_LIST(IDX_ICCP,    &mux->iccp_);
-  SWITCH_ID_LIST(IDX_ANIM,    &mux->anim_);
-  SWITCH_ID_LIST(IDX_EXIF,    &mux->exif_);
-  SWITCH_ID_LIST(IDX_XMP,     &mux->xmp_);
-  SWITCH_ID_LIST(IDX_UNKNOWN, &mux->unknown_);
+  SWITCH_ID_LIST(IDX_VP8X, &mux->vp8x_);
+  SWITCH_ID_LIST(IDX_ICCP, &mux->iccp_);
+  SWITCH_ID_LIST(IDX_ANIM, &mux->anim_);
+  SWITCH_ID_LIST(IDX_EXIF, &mux->exif_);
+  SWITCH_ID_LIST(IDX_XMP,  &mux->xmp_);
+  if (idx == IDX_UNKNOWN && data->size > TAG_SIZE) {
+    // For raw-data unknown chunk, the first four bytes should be the tag to be
+    // used for the chunk.
+    const WebPData tmp = { data->bytes + TAG_SIZE, data->size - TAG_SIZE };
+    err = ChunkAssignData(&chunk, &tmp, copy_data, GetLE32(data->bytes + 0));
+    if (err == WEBP_MUX_OK)
+      err = ChunkSetNth(&chunk, &mux->unknown_, nth);
+  }
  return err;
 }
 #undef SWITCH_ID_LIST

+static WebPMuxError MuxAddChunk(WebPMux* const mux, uint32_t nth, uint32_t tag,
+                                const uint8_t* data, size_t size,
+                                int copy_data) {
+  const CHUNK_INDEX idx = ChunkGetIndexFromTag(tag);
+  const WebPData chunk_data = { data, size };
+  assert(mux != NULL);
+  assert(size <= MAX_CHUNK_PAYLOAD);
+  assert(idx != IDX_NIL);
+  return MuxSet(mux, idx, nth, &chunk_data, copy_data);
+}
+
 // Create data for frame/fragment given image data, offsets and duration.
 static WebPMuxError CreateFrameFragmentData(
-    int width, int height, const WebPMuxFrameInfo* const info, int is_frame,
+    const WebPData* const image, int x_offset, int y_offset, int duration,
+    WebPMuxAnimDispose dispose_method, int is_lossless, int is_frame,
    WebPData* const frame_frgm) {
+  int width;
+  int height;
  uint8_t* frame_frgm_bytes;
  const size_t frame_frgm_size = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].size;

-  assert(width > 0 && height > 0 && info->duration >= 0);
-  assert(info->dispose_method == (info->dispose_method & 1));
+  const int ok = is_lossless ?
+      VP8LGetInfo(image->bytes, image->size, &width, &height, NULL) :
+      VP8GetInfo(image->bytes, image->size, image->size, &width, &height);
+  if (!ok) return WEBP_MUX_INVALID_ARGUMENT;
+
+  assert(width > 0 && height > 0 && duration >= 0);
+  assert(dispose_method == (dispose_method & 1));
  // Note: assertion on upper bounds is done in PutLE24().

  frame_frgm_bytes = (uint8_t*)malloc(frame_frgm_size);
  if (frame_frgm_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;

-  PutLE24(frame_frgm_bytes + 0, info->x_offset / 2);
-  PutLE24(frame_frgm_bytes + 3, info->y_offset / 2);
+  PutLE24(frame_frgm_bytes + 0, x_offset / 2);
+  PutLE24(frame_frgm_bytes + 3, y_offset / 2);

  if (is_frame) {
    PutLE24(frame_frgm_bytes + 6, width - 1);
    PutLE24(frame_frgm_bytes + 9, height - 1);
-    PutLE24(frame_frgm_bytes + 12, info->duration);
-    frame_frgm_bytes[15] =
-        (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
-        (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
+    PutLE24(frame_frgm_bytes + 12, duration);
+    frame_frgm_bytes[15] = (dispose_method & 1);
  }

  frame_frgm->bytes = frame_frgm_bytes;
@@ -167,9 +193,15 @@ static WebPMuxError DeleteChunks(WebPChunk** chunk_list, uint32_t tag) {

 static WebPMuxError MuxDeleteAllNamedData(WebPMux* const mux, uint32_t tag) {
  const WebPChunkId id = ChunkGetIdFromTag(tag);
+  WebPChunk** chunk_list;
+
  assert(mux != NULL);
  if (IsWPI(id)) return WEBP_MUX_INVALID_ARGUMENT;
-  return DeleteChunks(MuxGetChunkListFromId(mux, id), tag);
+
+  chunk_list = MuxGetChunkListFromId(mux, id);
+  if (chunk_list == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+
+  return DeleteChunks(chunk_list, tag);
 }

 //------------------------------------------------------------------------------
@@ -177,12 +209,14 @@ static WebPMuxError MuxDeleteAllNamedData(WebPMux* const mux, uint32_t tag) {

 WebPMuxError WebPMuxSetChunk(WebPMux* mux, const char fourcc[4],
                             const WebPData* chunk_data, int copy_data) {
+  CHUNK_INDEX idx;
  uint32_t tag;
  WebPMuxError err;
  if (mux == NULL || fourcc == NULL || chunk_data == NULL ||
      chunk_data->bytes == NULL || chunk_data->size > MAX_CHUNK_PAYLOAD) {
    return WEBP_MUX_INVALID_ARGUMENT;
  }
+  idx = ChunkGetIndexFromFourCC(fourcc);
  tag = ChunkGetTagFromFourCC(fourcc);

  // Delete existing chunk(s) with the same 'fourcc'.
@@ -190,7 +224,7 @@ WebPMuxError WebPMuxSetChunk(WebPMux* mux, const char fourcc[4],
  if (err != WEBP_MUX_OK && err != WEBP_MUX_NOT_FOUND) return err;

  // Add the given chunk.
-  return MuxSet(mux, tag, 1, chunk_data, copy_data);
+  return MuxSet(mux, idx, 1, chunk_data, copy_data);
 }

 // Creates a chunk from given 'data' and sets it as 1st chunk in 'chunk_list'.
@@ -225,9 +259,7 @@ static WebPMuxError SetAlphaAndImageChunks(
                             &wpi->alpha_);
    if (err != WEBP_MUX_OK) return err;
  }
-  err = AddDataToChunkList(&image, copy_data, image_tag, &wpi->img_);
-  if (err != WEBP_MUX_OK) return err;
-  return MuxImageFinalize(wpi) ? WEBP_MUX_OK : WEBP_MUX_INVALID_ARGUMENT;
+  return AddDataToChunkList(&image, copy_data, image_tag, &wpi->img_);
 }

 WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
@@ -243,7 +275,7 @@ WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,

  if (mux->images_ != NULL) {
    // Only one 'simple image' can be added in mux. So, remove present images.
-    DeleteAllImages(&mux->images_);
+    MuxImageDeleteAll(&mux->images_);
  }

  MuxImageInit(&wpi);
@@ -301,25 +333,24 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
  assert(wpi.img_ != NULL);  // As SetAlphaAndImageChunks() was successful.

  {
-    WebPData frame_frgm;
+    const int is_lossless = (wpi.img_->tag_ == kChunks[IDX_VP8L].tag);
+    const int x_offset = frame->x_offset & ~1;  // Snap offsets to even.
+    const int y_offset = frame->y_offset & ~1;
+    const int duration = is_frame ? frame->duration : 1 /* unused */;
+    const WebPMuxAnimDispose dispose_method =
+        is_frame ? frame->dispose_method : 0 /* unused */;
    const uint32_t tag = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].tag;
-    WebPMuxFrameInfo tmp = *frame;
-    tmp.x_offset &= ~1;  // Snap offsets to even.
-    tmp.y_offset &= ~1;
-    if (!is_frame) {  // Reset unused values.
-      tmp.duration = 1;
-      tmp.dispose_method = WEBP_MUX_DISPOSE_NONE;
-      tmp.blend_method = WEBP_MUX_BLEND;
-    }
-    if (tmp.x_offset < 0 || tmp.x_offset >= MAX_POSITION_OFFSET ||
-        tmp.y_offset < 0 || tmp.y_offset >= MAX_POSITION_OFFSET ||
-        (tmp.duration < 0 || tmp.duration >= MAX_DURATION) ||
-        tmp.dispose_method != (tmp.dispose_method & 1)) {
+    WebPData frame_frgm;
+    if (x_offset < 0 || x_offset >= MAX_POSITION_OFFSET ||
+        y_offset < 0 || y_offset >= MAX_POSITION_OFFSET ||
+        (duration < 0 || duration >= MAX_DURATION) ||
+        dispose_method != (dispose_method & 1)) {
      err = WEBP_MUX_INVALID_ARGUMENT;
      goto Err;
    }
-    err = CreateFrameFragmentData(wpi.width_, wpi.height_, &tmp, is_frame,
-                                  &frame_frgm);
+    err = CreateFrameFragmentData(&wpi.img_->data_, x_offset, y_offset,
+                                  duration, dispose_method, is_lossless,
+                                  is_frame, &frame_frgm);
    if (err != WEBP_MUX_OK) goto Err;
    // Add frame/fragment chunk (with copy_data = 1).
    err = AddDataToChunkList(&frame_frgm, 1, tag, &wpi.header_);
@@ -343,7 +374,6 @@ WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux,
                                       const WebPMuxAnimParams* params) {
  WebPMuxError err;
  uint8_t data[ANIM_CHUNK_SIZE];
-  const WebPData anim = { data, ANIM_CHUNK_SIZE };

  if (mux == NULL || params == NULL) return WEBP_MUX_INVALID_ARGUMENT;
  if (params->loop_count < 0 || params->loop_count >= MAX_LOOP_COUNT) {
@@ -357,7 +387,7 @@ WebPMuxError WebPMuxSetAnimationParams(WebPMux* mux,
  // Set the animation parameters.
  PutLE32(data, params->bgcolor);
  PutLE16(data + 4, params->loop_count);
-  return MuxSet(mux, kChunks[IDX_ANIM].tag, 1, &anim, 1);
+  return MuxAddChunk(mux, 1, kChunks[IDX_ANIM].tag, data, sizeof(data), 1);
 }

 //------------------------------------------------------------------------------
@@ -394,23 +424,40 @@ static WebPMuxError GetFrameFragmentInfo(
  return WEBP_MUX_OK;
 }

+WebPMuxError MuxGetImageWidthHeight(const WebPChunk* const image_chunk,
+                                    int* const width, int* const height) {
+  const uint32_t tag = image_chunk->tag_;
+  const WebPData* const data = &image_chunk->data_;
+  int w, h;
+  int ok;
+  assert(image_chunk != NULL);
+  assert(tag == kChunks[IDX_VP8].tag || tag ==  kChunks[IDX_VP8L].tag);
+  ok = (tag == kChunks[IDX_VP8].tag) ?
+      VP8GetInfo(data->bytes, data->size, data->size, &w, &h) :
+      VP8LGetInfo(data->bytes, data->size, &w, &h, NULL);
+  if (ok) {
+    *width = w;
+    *height = h;
+    return WEBP_MUX_OK;
+  } else {
+    return WEBP_MUX_BAD_DATA;
+  }
+}
+
 static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
                                 int* const x_offset, int* const y_offset,
                                 int* const duration,
                                 int* const width, int* const height) {
+  const WebPChunk* const image_chunk = wpi->img_;
  const WebPChunk* const frame_frgm_chunk = wpi->header_;
-  WebPMuxError err;
-  assert(wpi != NULL);
-  assert(frame_frgm_chunk != NULL);

  // Get offsets and duration from ANMF/FRGM chunk.
-  err = GetFrameFragmentInfo(frame_frgm_chunk, x_offset, y_offset, duration);
+  const WebPMuxError err =
+      GetFrameFragmentInfo(frame_frgm_chunk, x_offset, y_offset, duration);
  if (err != WEBP_MUX_OK) return err;

  // Get width and height from VP8/VP8L chunk.
-  if (width != NULL) *width = wpi->width_;
-  if (height != NULL) *height = wpi->height_;
-  return WEBP_MUX_OK;
+  return MuxGetImageWidthHeight(image_chunk, width, height);
 }

 static WebPMuxError GetImageCanvasWidthHeight(
@@ -424,12 +471,10 @@ static WebPMuxError GetImageCanvasWidthHeight(
  assert(wpi != NULL);
  assert(wpi->img_ != NULL);

-  if (wpi->next_ != NULL) {
+  if (wpi->next_) {
    int max_x = 0;
    int max_y = 0;
    int64_t image_area = 0;
-    // if we have a chain of wpi's, header_ is necessarily set
-    assert(wpi->header_ != NULL);
    // Aggregate the bounding box for animation frames & fragmented images.
    for (; wpi != NULL; wpi = wpi->next_) {
      int x_offset = 0, y_offset = 0, duration = 0, w = 0, h = 0;
@@ -457,9 +502,13 @@ static WebPMuxError GetImageCanvasWidthHeight(
      return WEBP_MUX_INVALID_ARGUMENT;
    }
  } else {
-    // For a single image, canvas dimensions are same as image dimensions.
-    *width = wpi->width_;
-    *height = wpi->height_;
+    // For a single image, extract the width & height from VP8/VP8L image-data.
+    int w, h;
+    const WebPChunk* const image_chunk = wpi->img_;
+    const WebPMuxError err = MuxGetImageWidthHeight(image_chunk, &w, &h);
+    if (err != WEBP_MUX_OK) return err;
+    *width = w;
+    *height = h;
  }
  return WEBP_MUX_OK;
 }
@@ -475,7 +524,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
  int width = 0;
  int height = 0;
  uint8_t data[VP8X_CHUNK_SIZE];
-  const WebPData vp8x = { data, VP8X_CHUNK_SIZE };
+  const size_t data_size = VP8X_CHUNK_SIZE;
  const WebPMuxImage* images = NULL;

  assert(mux != NULL);
@@ -528,8 +577,9 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
    return WEBP_MUX_INVALID_ARGUMENT;
  }

-  if (MuxHasAlpha(images)) {
-    // This means some frames explicitly/implicitly contain alpha.
+  if (MuxHasLosslessImages(images)) {
+    // We have a file with a VP8X chunk having some lossless images.
+    // As lossless images implicitly contain alpha, force ALPHA_FLAG to be true.
    // Note: This 'flags' update must NOT be done for a lossless image
    // without a VP8X chunk!
    flags |= ALPHA_FLAG;
@@ -539,7 +589,8 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
  PutLE24(data + 4, width - 1);   // canvas width.
  PutLE24(data + 7, height - 1);  // canvas height.

-  return MuxSet(mux, kChunks[IDX_VP8X].tag, 1, &vp8x, 1);
+  err = MuxAddChunk(mux, 1, kChunks[IDX_VP8X].tag, data, data_size, 1);
+  return err;
 }

 // Cleans up 'mux' by removing any unnecessary chunks.
@@ -578,25 +629,6 @@ static WebPMuxError MuxCleanup(WebPMux* const mux) {
  return WEBP_MUX_OK;
 }

-// Total size of a list of images.
-static size_t ImageListDiskSize(const WebPMuxImage* wpi_list) {
-  size_t size = 0;
-  while (wpi_list != NULL) {
-    size += MuxImageDiskSize(wpi_list);
-    wpi_list = wpi_list->next_;
-  }
-  return size;
-}
-
-// Write out the given list of images into 'dst'.
-static uint8_t* ImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst) {
-  while (wpi_list != NULL) {
-    dst = MuxImageEmit(wpi_list, dst);
-    wpi_list = wpi_list->next_;
-  }
-  return dst;
-}
-
 WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
  size_t size = 0;
  uint8_t* data = NULL;
@@ -614,10 +646,10 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
  if (err != WEBP_MUX_OK) return err;

  // Allocate data.
-  size = ChunkListDiskSize(mux->vp8x_) + ChunkListDiskSize(mux->iccp_)
-       + ChunkListDiskSize(mux->anim_) + ImageListDiskSize(mux->images_)
-       + ChunkListDiskSize(mux->exif_) + ChunkListDiskSize(mux->xmp_)
-       + ChunkListDiskSize(mux->unknown_) + RIFF_HEADER_SIZE;
+  size = ChunksListDiskSize(mux->vp8x_) + ChunksListDiskSize(mux->iccp_)
+       + ChunksListDiskSize(mux->anim_) + MuxImageListDiskSize(mux->images_)
+       + ChunksListDiskSize(mux->exif_) + ChunksListDiskSize(mux->xmp_)
+       + ChunksListDiskSize(mux->unknown_) + RIFF_HEADER_SIZE;

  data = (uint8_t*)malloc(size);
  if (data == NULL) return WEBP_MUX_MEMORY_ERROR;
@@ -627,7 +659,7 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {
  dst = ChunkListEmit(mux->vp8x_, dst);
  dst = ChunkListEmit(mux->iccp_, dst);
  dst = ChunkListEmit(mux->anim_, dst);
-  dst = ImageListEmit(mux->images_, dst);
+  dst = MuxImageListEmit(mux->images_, dst);
  dst = ChunkListEmit(mux->exif_, dst);
  dst = ChunkListEmit(mux->xmp_, dst);
  dst = ChunkListEmit(mux->unknown_, dst);
@@ -650,3 +682,6 @@ WebPMuxError WebPMuxAssemble(WebPMux* mux, WebPData* assembled_data) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/mux/muxi.h
+++ b/src/mux/muxi.h
@@ -19,7 +19,7 @@
 #include "../dec/vp8li.h"
 #include "../webp/mux.h"

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

@@ -27,8 +27,8 @@ extern "C" {
 // Defines and constants.

 #define MUX_MAJ_VERSION 0
-#define MUX_MIN_VERSION 2
-#define MUX_REV_VERSION 0
+#define MUX_MIN_VERSION 1
+#define MUX_REV_VERSION 1

 // Chunk object.
 typedef struct WebPChunk WebPChunk;
@@ -48,10 +48,6 @@ struct WebPMuxImage {
  WebPChunk*  header_;      // Corresponds to WEBP_CHUNK_ANMF/WEBP_CHUNK_FRGM.
  WebPChunk*  alpha_;       // Corresponds to WEBP_CHUNK_ALPHA.
  WebPChunk*  img_;         // Corresponds to WEBP_CHUNK_IMAGE.
-  WebPChunk*  unknown_;     // Corresponds to WEBP_CHUNK_UNKNOWN.
-  int         width_;
-  int         height_;
-  int         has_alpha_;   // Through ALPH chunk or as part of VP8L.
  int         is_partial_;  // True if only some of the chunks are filled.
  WebPMuxImage* next_;
 };
@@ -105,10 +101,10 @@ extern const ChunkInfo kChunks[IDX_LAST_CHUNK];
 // Initialize.
 void ChunkInit(WebPChunk* const chunk);

-// Get chunk index from chunk tag. Returns IDX_UNKNOWN if not found.
+// Get chunk index from chunk tag. Returns IDX_NIL if not found.
 CHUNK_INDEX ChunkGetIndexFromTag(uint32_t tag);

-// Get chunk id from chunk tag. Returns WEBP_CHUNK_UNKNOWN if not found.
+// Get chunk id from chunk tag. Returns WEBP_CHUNK_NIL if not found.
 WebPChunkId ChunkGetIdFromTag(uint32_t tag);

 // Convert a fourcc string to a tag.
@@ -137,9 +133,6 @@ WebPChunk* ChunkRelease(WebPChunk* const chunk);
 // Deletes given chunk & returns chunk->next_.
 WebPChunk* ChunkDelete(WebPChunk* const chunk);

-// Deletes all chunks in the given chunk list.
-void ChunkListDelete(WebPChunk** const chunk_list);
-
 // Returns size of the chunk including chunk header and padding byte (if any).
 static WEBP_INLINE size_t SizeWithPadding(size_t chunk_size) {
  return CHUNK_HEADER_SIZE + ((chunk_size + 1) & ~1U);
@@ -153,11 +146,15 @@ static WEBP_INLINE size_t ChunkDiskSize(const WebPChunk* chunk) {
 }

 // Total size of a list of chunks.
-size_t ChunkListDiskSize(const WebPChunk* chunk_list);
+size_t ChunksListDiskSize(const WebPChunk* chunk_list);

 // Write out the given list of chunks into 'dst'.
 uint8_t* ChunkListEmit(const WebPChunk* chunk_list, uint8_t* dst);

+// Get the width & height of image stored in 'image_chunk'.
+WebPMuxError MuxGetImageWidthHeight(const WebPChunk* const image_chunk,
+                                    int* const width, int* const height);
+
 //------------------------------------------------------------------------------
 // MuxImage object management.

@@ -171,14 +168,13 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi);
 // 'wpi' can be NULL.
 WebPMuxImage* MuxImageDelete(WebPMuxImage* const wpi);

+// Delete all images in 'wpi_list'.
+void MuxImageDeleteAll(WebPMuxImage** const wpi_list);
+
 // Count number of images matching the given tag id in the 'wpi_list'.
 // If id == WEBP_CHUNK_NIL, all images will be matched.
 int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id);

-// Update width/height/has_alpha info from chunks within wpi.
-// Also remove ALPH chunk if not needed.
-int MuxImageFinalize(WebPMuxImage* const wpi);
-
 // Check if given ID corresponds to an image related chunk.
 static WEBP_INLINE int IsWPI(WebPChunkId id) {
  switch (id) {
@@ -190,6 +186,19 @@ static WEBP_INLINE int IsWPI(WebPChunkId id) {
  }
 }

+// Get a reference to appropriate chunk list within an image given chunk tag.
+static WEBP_INLINE WebPChunk** MuxImageGetListFromId(
+    const WebPMuxImage* const wpi, WebPChunkId id) {
+  assert(wpi != NULL);
+  switch (id) {
+    case WEBP_CHUNK_ANMF:
+    case WEBP_CHUNK_FRGM:  return (WebPChunk**)&wpi->header_;
+    case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
+    case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
+    default: return NULL;
+  }
+}
+
 // Pushes 'wpi' at the end of 'wpi_list'.
 WebPMuxError MuxImagePush(const WebPMuxImage* wpi, WebPMuxImage** wpi_list);

@@ -203,27 +212,38 @@ WebPMuxError MuxImageGetNth(const WebPMuxImage** wpi_list, uint32_t nth,
 // Total size of the given image.
 size_t MuxImageDiskSize(const WebPMuxImage* const wpi);

+// Total size of a list of images.
+size_t MuxImageListDiskSize(const WebPMuxImage* wpi_list);
+
 // Write out the given image into 'dst'.
 uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst);

+// Write out the given list of images into 'dst'.
+uint8_t* MuxImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst);
+
 //------------------------------------------------------------------------------
 // Helper methods for mux.

-// Checks if the given image list contains at least one image with alpha.
-int MuxHasAlpha(const WebPMuxImage* images);
+// Checks if the given image list contains at least one lossless image.
+int MuxHasLosslessImages(const WebPMuxImage* images);

 // Write out RIFF header into 'data', given total data size 'size'.
 uint8_t* MuxEmitRiffHeader(uint8_t* const data, size_t size);

 // Returns the list where chunk with given ID is to be inserted in mux.
+// Return value is NULL if this chunk should be inserted in mux->images_ list
+// or if 'id' is not known.
 WebPChunk** MuxGetChunkListFromId(const WebPMux* mux, WebPChunkId id);

+// Validates that the given mux has a single image.
+WebPMuxError MuxValidateForImage(const WebPMux* const mux);
+
 // Validates the given mux object.
 WebPMuxError MuxValidate(const WebPMux* const mux);

 //------------------------------------------------------------------------------

-#ifdef __cplusplus
+#if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

--- a/src/mux/muxinternal.c
+++ b/src/mux/muxinternal.c
@@ -16,6 +16,10 @@
 #include "./muxi.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 #define UNDEFINED_CHUNK_SIZE (-1)

 const ChunkInfo kChunks[] = {
@@ -29,7 +33,7 @@ const ChunkInfo kChunks[] = {
  { MKFOURCC('V', 'P', '8', 'L'),  WEBP_CHUNK_IMAGE,   UNDEFINED_CHUNK_SIZE },
  { MKFOURCC('E', 'X', 'I', 'F'),  WEBP_CHUNK_EXIF,    UNDEFINED_CHUNK_SIZE },
  { MKFOURCC('X', 'M', 'P', ' '),  WEBP_CHUNK_XMP,     UNDEFINED_CHUNK_SIZE },
-  { NIL_TAG,                       WEBP_CHUNK_UNKNOWN, UNDEFINED_CHUNK_SIZE },
+  { MKFOURCC('U', 'N', 'K', 'N'),  WEBP_CHUNK_UNKNOWN, UNDEFINED_CHUNK_SIZE },

  { NIL_TAG,                       WEBP_CHUNK_NIL,     UNDEFINED_CHUNK_SIZE }
 };
@@ -66,9 +70,9 @@ WebPChunk* ChunkRelease(WebPChunk* const chunk) {
 CHUNK_INDEX ChunkGetIndexFromTag(uint32_t tag) {
  int i;
  for (i = 0; kChunks[i].tag != NIL_TAG; ++i) {
-    if (tag == kChunks[i].tag) return (CHUNK_INDEX)i;
+    if (tag == kChunks[i].tag) return i;
  }
-  return IDX_UNKNOWN;
+  return IDX_NIL;
 }

 WebPChunkId ChunkGetIdFromTag(uint32_t tag) {
@@ -76,7 +80,7 @@ WebPChunkId ChunkGetIdFromTag(uint32_t tag) {
  for (i = 0; kChunks[i].tag != NIL_TAG; ++i) {
    if (tag == kChunks[i].tag) return kChunks[i].id;
  }
-  return WEBP_CHUNK_UNKNOWN;
+  return WEBP_CHUNK_NIL;
 }

 uint32_t ChunkGetTagFromFourCC(const char fourcc[4]) {
@@ -85,7 +89,8 @@ uint32_t ChunkGetTagFromFourCC(const char fourcc[4]) {

 CHUNK_INDEX ChunkGetIndexFromFourCC(const char fourcc[4]) {
  const uint32_t tag = ChunkGetTagFromFourCC(fourcc);
-  return ChunkGetIndexFromTag(tag);
+  const CHUNK_INDEX idx = ChunkGetIndexFromTag(tag);
+  return (idx == IDX_NIL) ? IDX_UNKNOWN : idx;
 }

 //------------------------------------------------------------------------------
@@ -183,15 +188,18 @@ WebPChunk* ChunkDelete(WebPChunk* const chunk) {
  return next;
 }

-void ChunkListDelete(WebPChunk** const chunk_list) {
-  while (*chunk_list != NULL) {
-    *chunk_list = ChunkDelete(*chunk_list);
-  }
-}
-
 //------------------------------------------------------------------------------
 // Chunk serialization methods.

+size_t ChunksListDiskSize(const WebPChunk* chunk_list) {
+  size_t size = 0;
+  while (chunk_list != NULL) {
+    size += ChunkDiskSize(chunk_list);
+    chunk_list = chunk_list->next_;
+  }
+  return size;
+}
+
 static uint8_t* ChunkEmit(const WebPChunk* const chunk, uint8_t* dst) {
  const size_t chunk_size = chunk->data_.size;
  assert(chunk);
@@ -213,15 +221,6 @@ uint8_t* ChunkListEmit(const WebPChunk* chunk_list, uint8_t* dst) {
  return dst;
 }

-size_t ChunkListDiskSize(const WebPChunk* chunk_list) {
-  size_t size = 0;
-  while (chunk_list != NULL) {
-    size += ChunkDiskSize(chunk_list);
-    chunk_list = chunk_list->next_;
-  }
-  return size;
-}
-
 //------------------------------------------------------------------------------
 // Life of a MuxImage object.

@@ -236,7 +235,6 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) {
  ChunkDelete(wpi->header_);
  ChunkDelete(wpi->alpha_);
  ChunkDelete(wpi->img_);
-  ChunkListDelete(&wpi->unknown_);

  next = wpi->next_;
  MuxImageInit(wpi);
@@ -246,19 +244,6 @@ WebPMuxImage* MuxImageRelease(WebPMuxImage* const wpi) {
 //------------------------------------------------------------------------------
 // MuxImage search methods.

-// Get a reference to appropriate chunk list within an image given chunk tag.
-static WebPChunk** GetChunkListFromId(const WebPMuxImage* const wpi,
-                                      WebPChunkId id) {
-  assert(wpi != NULL);
-  switch (id) {
-    case WEBP_CHUNK_ANMF:
-    case WEBP_CHUNK_FRGM:  return (WebPChunk**)&wpi->header_;
-    case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
-    case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
-    default: return NULL;
-  }
-}
-
 int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id) {
  int count = 0;
  const WebPMuxImage* current;
@@ -266,7 +251,7 @@ int MuxImageCount(const WebPMuxImage* wpi_list, WebPChunkId id) {
    if (id == WEBP_CHUNK_NIL) {
      ++count;  // Special case: count all images.
    } else {
-      const WebPChunk* const wpi_chunk = *GetChunkListFromId(current, id);
+      const WebPChunk* const wpi_chunk = *MuxImageGetListFromId(current, id);
      if (wpi_chunk != NULL) {
        const WebPChunkId wpi_chunk_id = ChunkGetIdFromTag(wpi_chunk->tag_);
        if (wpi_chunk_id == id) ++count;  // Count images with a matching 'id'.
@@ -335,6 +320,12 @@ WebPMuxImage* MuxImageDelete(WebPMuxImage* const wpi) {
  return next;
 }

+void MuxImageDeleteAll(WebPMuxImage** const wpi_list) {
+  while (*wpi_list != NULL) {
+    *wpi_list = MuxImageDelete(*wpi_list);
+  }
+}
+
 WebPMuxError MuxImageDeleteNth(WebPMuxImage** wpi_list, uint32_t nth) {
  assert(wpi_list);
  if (!SearchImageToGetOrDelete(wpi_list, nth, &wpi_list)) {
@@ -368,7 +359,15 @@ size_t MuxImageDiskSize(const WebPMuxImage* const wpi) {
  if (wpi->header_ != NULL) size += ChunkDiskSize(wpi->header_);
  if (wpi->alpha_ != NULL) size += ChunkDiskSize(wpi->alpha_);
  if (wpi->img_ != NULL) size += ChunkDiskSize(wpi->img_);
-  if (wpi->unknown_ != NULL) size += ChunkListDiskSize(wpi->unknown_);
+  return size;
+}
+
+size_t MuxImageListDiskSize(const WebPMuxImage* wpi_list) {
+  size_t size = 0;
+  while (wpi_list != NULL) {
+    size += MuxImageDiskSize(wpi_list);
+    wpi_list = wpi_list->next_;
+  }
  return size;
 }

@@ -400,16 +399,26 @@ uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst) {
  }
  if (wpi->alpha_ != NULL) dst = ChunkEmit(wpi->alpha_, dst);
  if (wpi->img_ != NULL) dst = ChunkEmit(wpi->img_, dst);
-  if (wpi->unknown_ != NULL) dst = ChunkListEmit(wpi->unknown_, dst);
+  return dst;
+}
+
+uint8_t* MuxImageListEmit(const WebPMuxImage* wpi_list, uint8_t* dst) {
+  while (wpi_list != NULL) {
+    dst = MuxImageEmit(wpi_list, dst);
+    wpi_list = wpi_list->next_;
+  }
  return dst;
 }

 //------------------------------------------------------------------------------
 // Helper methods for mux.

-int MuxHasAlpha(const WebPMuxImage* images) {
+int MuxHasLosslessImages(const WebPMuxImage* images) {
  while (images != NULL) {
-    if (images->has_alpha_) return 1;
+    assert(images->img_ != NULL);
+    if (images->img_->tag_ == kChunks[IDX_VP8L].tag) {
+      return 1;
+    }
    images = images->next_;
  }
  return 0;
@@ -431,7 +440,25 @@ WebPChunk** MuxGetChunkListFromId(const WebPMux* mux, WebPChunkId id) {
    case WEBP_CHUNK_ANIM:    return (WebPChunk**)&mux->anim_;
    case WEBP_CHUNK_EXIF:    return (WebPChunk**)&mux->exif_;
    case WEBP_CHUNK_XMP:     return (WebPChunk**)&mux->xmp_;
-    default:                 return (WebPChunk**)&mux->unknown_;
+    case WEBP_CHUNK_UNKNOWN: return (WebPChunk**)&mux->unknown_;
+    default: return NULL;
+  }
+}
+
+WebPMuxError MuxValidateForImage(const WebPMux* const mux) {
+  const int num_images = MuxImageCount(mux->images_, WEBP_CHUNK_IMAGE);
+  const int num_frames = MuxImageCount(mux->images_, WEBP_CHUNK_ANMF);
+  const int num_fragments = MuxImageCount(mux->images_, WEBP_CHUNK_FRGM);
+
+  if (num_images == 0) {
+    // No images in mux.
+    return WEBP_MUX_NOT_FOUND;
+  } else if (num_images == 1 && num_frames == 0 && num_fragments == 0) {
+    // Valid case (single image).
+    return WEBP_MUX_OK;
+  } else {
+    // Frame/Fragment case OR an invalid mux.
+    return WEBP_MUX_INVALID_ARGUMENT;
  }
 }

@@ -447,7 +474,7 @@ static int IsNotCompatible(int feature, int num_items) {
 // On success returns WEBP_MUX_OK and stores the chunk count in *num.
 static WebPMuxError ValidateChunk(const WebPMux* const mux, CHUNK_INDEX idx,
                                  WebPFeatureFlags feature,
-                                  uint32_t vp8x_flags,
+                                  WebPFeatureFlags vp8x_flags,
                                  int max, int* num) {
  const WebPMuxError err =
      WebPMuxNumChunks(mux, kChunks[idx].id, num);
@@ -523,18 +550,14 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
  if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;

  // ALPHA_FLAG & alpha chunk(s) are consistent.
-  if (MuxHasAlpha(mux->images_)) {
+  if (MuxHasLosslessImages(mux->images_)) {
    if (num_vp8x > 0) {
-      // VP8X chunk is present, so it should contain ALPHA_FLAG.
+      // Special case: we have a VP8X chunk as well as some lossless images.
      if (!(flags & ALPHA_FLAG)) return WEBP_MUX_INVALID_ARGUMENT;
-    } else {
-      // VP8X chunk is not present, so ALPH chunks should NOT be present either.
-      err = WebPMuxNumChunks(mux, WEBP_CHUNK_ALPHA, &num_alpha);
-      if (err != WEBP_MUX_OK) return err;
-      if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
    }
-  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
-    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
+  } else {
+      err = ValidateChunk(mux, IDX_ALPHA, ALPHA_FLAG, flags, -1, &num_alpha);
+      if (err != WEBP_MUX_OK) return err;
  }

  // num_fragments & num_images are consistent.
@@ -549,3 +572,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/mux/muxread.c
+++ b/src/mux/muxread.c
@@ -16,6 +16,10 @@
 #include "./muxi.h"
 #include "../utils/utils.h"

+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
 //------------------------------------------------------------------------------
 // Helper method(s).

@@ -72,29 +76,6 @@ static WebPMuxError ChunkVerifyAndAssign(WebPChunk* chunk,
  return ChunkAssignData(chunk, &chunk_data, copy_data, GetLE32(data + 0));
 }

-int MuxImageFinalize(WebPMuxImage* const wpi) {
-  const WebPChunk* const img = wpi->img_;
-  const WebPData* const image = &img->data_;
-  const int is_lossless = (img->tag_ == kChunks[IDX_VP8L].tag);
-  int w, h;
-  int vp8l_has_alpha = 0;
-  const int ok = is_lossless ?
-      VP8LGetInfo(image->bytes, image->size, &w, &h, &vp8l_has_alpha) :
-      VP8GetInfo(image->bytes, image->size, image->size, &w, &h);
-  assert(img != NULL);
-  if (ok) {
-    // Ignore ALPH chunk accompanying VP8L.
-    if (is_lossless && (wpi->alpha_ != NULL)) {
-      ChunkDelete(wpi->alpha_);
-      wpi->alpha_ = NULL;
-    }
-    wpi->width_ = w;
-    wpi->height_ = h;
-    wpi->has_alpha_ = vp8l_has_alpha || (wpi->alpha_ != NULL);
-  }
-  return ok;
-}
-
 static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
                         WebPMuxImage* const wpi) {
  const uint8_t* bytes = chunk->data_.bytes;
@@ -140,14 +121,8 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
        break;
      case WEBP_CHUNK_IMAGE:
        if (ChunkSetNth(&subchunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Fail;
-        if (!MuxImageFinalize(wpi)) goto Fail;
        wpi->is_partial_ = 0;  // wpi is completely filled.
        break;
-      case WEBP_CHUNK_UNKNOWN:
-        if (wpi->is_partial_) goto Fail;  // Encountered an unknown chunk
-                                          // before some image chunks.
-        if (ChunkSetNth(&subchunk, &wpi->unknown_, 0) != WEBP_MUX_OK) goto Fail;
-        break;
      default:
        goto Fail;
        break;
@@ -243,7 +218,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
        break;
      case WEBP_CHUNK_IMAGE:
        if (ChunkSetNth(&chunk, &wpi->img_, 1) != WEBP_MUX_OK) goto Err;
-        if (!MuxImageFinalize(wpi)) goto Err;
        wpi->is_partial_ = 0;  // wpi is completely filled.
 PushImage:
        // Add this to mux->images_ list.
@@ -263,6 +237,7 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
        if (wpi->is_partial_) goto Err;  // Encountered a non-image chunk before
                                         // getting all chunks of an image.
        chunk_list = MuxGetChunkListFromId(mux, id);  // List to add this chunk.
+        if (chunk_list == NULL) chunk_list = &mux->unknown_;
        if (ChunkSetNth(&chunk, chunk_list, 0) != WEBP_MUX_OK) goto Err;
        break;
    }
@@ -287,68 +262,35 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
 //------------------------------------------------------------------------------
 // Get API(s).

-// Validates that the given mux has a single image.
-static WebPMuxError ValidateForSingleImage(const WebPMux* const mux) {
-  const int num_images = MuxImageCount(mux->images_, WEBP_CHUNK_IMAGE);
-  const int num_frames = MuxImageCount(mux->images_, WEBP_CHUNK_ANMF);
-  const int num_fragments = MuxImageCount(mux->images_, WEBP_CHUNK_FRGM);
-
-  if (num_images == 0) {
-    // No images in mux.
-    return WEBP_MUX_NOT_FOUND;
-  } else if (num_images == 1 && num_frames == 0 && num_fragments == 0) {
-    // Valid case (single image).
-    return WEBP_MUX_OK;
-  } else {
-    // Frame/Fragment case OR an invalid mux.
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-}
-
-// Get the canvas width, height and flags after validating that VP8X/VP8/VP8L
-// chunk and canvas size are valid.
-static WebPMuxError MuxGetCanvasInfo(const WebPMux* const mux,
-                                     int* width, int* height, uint32_t* flags) {
-  int w, h;
-  uint32_t f = 0;
+WebPMuxError WebPMuxGetFeatures(const WebPMux* mux, uint32_t* flags) {
  WebPData data;
-  assert(mux != NULL);
+
+  if (mux == NULL || flags == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+  *flags = 0;

  // Check if VP8X chunk is present.
  if (MuxGet(mux, IDX_VP8X, 1, &data) == WEBP_MUX_OK) {
-    if (data.size < VP8X_CHUNK_SIZE) return WEBP_MUX_BAD_DATA;
-    f = GetLE32(data.bytes + 0);
-    w = GetLE24(data.bytes + 4) + 1;
-    h = GetLE24(data.bytes + 7) + 1;
-  } else {  // Single image case.
-    const WebPMuxImage* const wpi = mux->images_;
-    WebPMuxError err = ValidateForSingleImage(mux);
+    if (data.size < CHUNK_SIZE_BYTES) return WEBP_MUX_BAD_DATA;
+    *flags = GetLE32(data.bytes);  // All OK. Fill up flags.
+  } else {
+    WebPMuxError err = MuxValidateForImage(mux);  // Check for single image.
    if (err != WEBP_MUX_OK) return err;
-    assert(wpi != NULL);
-    w = wpi->width_;
-    h = wpi->height_;
-    if (wpi->has_alpha_) f |= ALPHA_FLAG;
+    if (MuxHasLosslessImages(mux->images_)) {
+      const WebPData* const vp8l_data = &mux->images_->img_->data_;
+      int has_alpha = 0;
+      if (!VP8LGetInfo(vp8l_data->bytes, vp8l_data->size, NULL, NULL,
+                       &has_alpha)) {
+        return WEBP_MUX_BAD_DATA;
+      }
+      if (has_alpha) {
+        *flags = ALPHA_FLAG;
+      }
+    }
  }
-  if (w * (uint64_t)h >= MAX_IMAGE_AREA) return WEBP_MUX_BAD_DATA;

-  if (width != NULL) *width = w;
-  if (height != NULL) *height = h;
-  if (flags != NULL) *flags = f;
  return WEBP_MUX_OK;
 }

-WebPMuxError WebPMuxGetCanvasSize(const WebPMux* mux, int* width, int* height) {
-  if (mux == NULL || width == NULL || height == NULL) {
-    return WEBP_MUX_INVALID_ARGUMENT;
-  }
-  return MuxGetCanvasInfo(mux, width, height, NULL);
-}
-
-WebPMuxError WebPMuxGetFeatures(const WebPMux* mux, uint32_t* flags) {
-  if (mux == NULL || flags == NULL) return WEBP_MUX_INVALID_ARGUMENT;
-  return MuxGetCanvasInfo(mux, NULL, NULL, flags);
-}
-
 static uint8_t* EmitVP8XChunk(uint8_t* const dst, int width,
                              int height, uint32_t flags) {
  const size_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
@@ -382,7 +324,15 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
  dst = MuxEmitRiffHeader(data, size);

  if (need_vp8x) {
-    dst = EmitVP8XChunk(dst, wpi->width_, wpi->height_, ALPHA_FLAG);  // VP8X.
+    int w, h;
+    WebPMuxError err;
+    assert(wpi->img_ != NULL);
+    err = MuxGetImageWidthHeight(wpi->img_, &w, &h);
+    if (err != WEBP_MUX_OK) {
+      free(data);
+      return err;
+    }
+    dst = EmitVP8XChunk(dst, w, h, ALPHA_FLAG);  // VP8X.
    dst = ChunkListEmit(wpi->alpha_, dst);       // ALPH.
  }

@@ -422,8 +372,6 @@ static WebPMuxError MuxGetImageInternal(const WebPMuxImage* const wpi,
  info->x_offset = 0;
  info->y_offset = 0;
  info->duration = 1;
-  info->dispose_method = WEBP_MUX_DISPOSE_NONE;
-  info->blend_method = WEBP_MUX_BLEND;
  // Extract data for related fields.
  info->id = ChunkGetIdFromTag(wpi->img_->tag_);
  return SynthesizeBitstream(wpi, &info->bitstream);
@@ -444,17 +392,10 @@ static WebPMuxError MuxGetFrameFragmentInternal(const WebPMuxImage* const wpi,
  // Extract info.
  frame->x_offset = 2 * GetLE24(frame_frgm_data->bytes + 0);
  frame->y_offset = 2 * GetLE24(frame_frgm_data->bytes + 3);
-  if (is_frame) {
-    const uint8_t bits = frame_frgm_data->bytes[15];
-    frame->duration = GetLE24(frame_frgm_data->bytes + 12);
-    frame->dispose_method =
-        (bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
-    frame->blend_method = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
-  } else {  // Defaults for unused values.
-    frame->duration = 1;
-    frame->dispose_method = WEBP_MUX_DISPOSE_NONE;
-    frame->blend_method = WEBP_MUX_BLEND;
-  }
+  frame->duration = is_frame ? GetLE24(frame_frgm_data->bytes + 12) : 1;
+  frame->dispose_method =
+      is_frame ? (WebPMuxAnimDispose)(frame_frgm_data->bytes[15] & 1)
+               : WEBP_MUX_DISPOSE_NONE;
  frame->id = ChunkGetIdFromTag(wpi->header_->tag_);
  return SynthesizeBitstream(wpi, &frame->bitstream);
 }
@@ -501,7 +442,7 @@ WebPMuxError WebPMuxGetAnimationParams(const WebPMux* mux,
 static CHUNK_INDEX ChunkGetIndexFromId(WebPChunkId id) {
  int i;
  for (i = 0; kChunks[i].id != WEBP_CHUNK_NIL; ++i) {
-    if (id == kChunks[i].id) return (CHUNK_INDEX)i;
+    if (id == kChunks[i].id) return i;
  }
  return IDX_NIL;
 }
@@ -529,8 +470,12 @@ WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
    *num_elements = MuxImageCount(mux->images_, id);
  } else {
    WebPChunk* const* chunk_list = MuxGetChunkListFromId(mux, id);
-    const CHUNK_INDEX idx = ChunkGetIndexFromId(id);
-    *num_elements = CountChunks(*chunk_list, kChunks[idx].tag);
+    if (chunk_list == NULL) {
+      *num_elements = 0;
+    } else {
+      const CHUNK_INDEX idx = ChunkGetIndexFromId(id);
+      *num_elements = CountChunks(*chunk_list, kChunks[idx].tag);
+    }
  }

  return WEBP_MUX_OK;
@@ -538,3 +483,6 @@ WebPMuxError WebPMuxNumChunks(const WebPMux* mux,

 //------------------------------------------------------------------------------

+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/utils/Makefile.am
+++ b/src/utils/Makefile.am
@@ -9,8 +9,6 @@ common_HEADERS = ../webp/types.h
 commondir = $(includedir)/webp

 COMMON_SOURCES =
-COMMON_SOURCES += alpha_processing.c
-COMMON_SOURCES += alpha_processing.h
 COMMON_SOURCES += bit_reader.c
 COMMON_SOURCES += bit_reader.h
 COMMON_SOURCES += color_cache.c
@@ -23,8 +21,6 @@ COMMON_SOURCES += quant_levels_dec.c
 COMMON_SOURCES += quant_levels_dec.h
 COMMON_SOURCES += rescaler.c
 COMMON_SOURCES += rescaler.h
-COMMON_SOURCES += random.c
-COMMON_SOURCES += random.h
 COMMON_SOURCES += thread.c
 COMMON_SOURCES += thread.h
 COMMON_SOURCES += utils.c
--- a/src/utils/alpha_processing.c
+++ b/src/utils/alpha_processing.c
@@ -1,196 +0,0 @@
-// Copyright 2013 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Utilities for processing transparent channel.
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <assert.h>
-#include "./alpha_processing.h"
-
-// Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
-
-// -----------------------------------------------------------------------------
-
-#define MFIX 24    // 24bit fixed-point arithmetic
-#define HALF ((1u << MFIX) >> 1)
-#define KINV_255 ((1u << MFIX) / 255u)
-
-static uint32_t Mult(uint8_t x, uint32_t mult) {
-  const uint32_t v = (x * mult + HALF) >> MFIX;
-  assert(v <= 255);  // <- 24bit precision is enough to ensure that.
-  return v;
-}
-
-#ifdef USE_TABLES_FOR_ALPHA_MULT
-
-static const uint32_t kMultTables[2][256] = {
-  {    // (255u << MFIX) / alpha
-    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
-    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
-    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
-    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
-    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
-    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
-    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
-    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
-    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
-    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
-    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
-    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
-    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
-    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
-    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
-    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
-    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
-    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
-    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
-    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
-    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
-    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
-    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
-    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
-    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
-    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
-    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
-    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
-    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
-    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
-    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
-    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
-    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
-    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
-    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
-    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
-    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
-    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
-    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
-    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
-    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
-    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
-    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
-  {   // alpha * KINV_255
-    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
-    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
-    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
-    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
-    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
-    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
-    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
-    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
-    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
-    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
-    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
-    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
-    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
-    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
-    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
-    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
-    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
-    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
-    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
-    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
-    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
-    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
-    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
-    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
-    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
-    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
-    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
-    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
-    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
-    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
-    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
-    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
-    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
-    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
-    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
-    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
-    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
-    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
-    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
-    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
-    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
-    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
-    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
-};
-
-static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
-  return kMultTables[!inverse][a];
-}
-
-#else
-
-static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
-  return inverse ? (255u << MFIX) / a : a * KINV_255;
-}
-
-#endif    // USE_TABLES_FOR_ALPHA_MULT
-
-void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    const uint32_t argb = ptr[x];
-    if (argb < 0xff000000u) {      // alpha < 255
-      if (argb <= 0x00ffffffu) {   // alpha == 0
-        ptr[x] = 0;
-      } else {
-        const uint32_t alpha = (argb >> 24) & 0xff;
-        const uint32_t scale = GetScale(alpha, inverse);
-        uint32_t out = argb & 0xff000000u;
-        out |= Mult(argb >>  0, scale) <<  0;
-        out |= Mult(argb >>  8, scale) <<  8;
-        out |= Mult(argb >> 16, scale) << 16;
-        ptr[x] = out;
-      }
-    }
-  }
-}
-
-void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
-                      int inverse) {
-  int n;
-  for (n = 0; n < num_rows; ++n) {
-    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
-    ptr += stride;
-  }
-}
-
-void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
-                 int width, int inverse) {
-  int x;
-  for (x = 0; x < width; ++x) {
-    const uint32_t a = alpha[x];
-    if (a != 255) {
-      if (a == 0) {
-        ptr[x] = 0;
-      } else {
-        const uint32_t scale = GetScale(a, inverse);
-        ptr[x] = Mult(ptr[x], scale);
-      }
-    }
-  }
-}
-
-void WebPMultRows(uint8_t* ptr, int stride,
-                  const uint8_t* alpha, int alpha_stride,
-                  int width, int num_rows, int inverse) {
-  int n;
-  for (n = 0; n < num_rows; ++n) {
-    WebPMultRow(ptr, alpha, width, inverse);
-    ptr += stride;
-    alpha += alpha_stride;
-  }
-}
-
-#undef KINV_255
-#undef HALF
-#undef MFIX
-
--- a/Show More
+++ b/Show More