update ChangeLog

Change-Id: I3bd7347337323f727a979dccadddfd22f56ec1c2
Harmonize the dates
2011-09-22 13:51:57 -07:00 · 2011-09-22 13:50:10 -07:00 · 2011-09-20 07:59:43 -07:00 · 2011-09-16 14:33:38 -07:00 · 2011-09-16 12:32:03 -07:00 · 2011-09-16 10:55:24 -07:00
77 changed files with 11561 additions and 1818 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+.gitattributes export-ignore
+.gitignore export-ignore
+.mailmap export-ignore
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 /config.*
 /configure
 /depcomp
+/dist
 /install-sh
 /libtool
 /ltmain.sh
@@ -16,3 +17,6 @@
 Makefile
 Makefile.in
 examples/[cd]webp
+/output
+*.idb
+*.pdb
--- a/.mailmap
+++ b/.mailmap
@@ -0,0 +1,2 @@
+Mikołaj Zalewski <mikolajz@google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,7 +1,9 @@
 Contributors:
+- James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Mikołaj Zalewski (mikolajz at google dot com)
 - Pascal Massimino (pascal dot massimino at gmail dot com)
-- pierre.php@gmail.com
-- Somnath Banerjee (somnath at google dot com)
+- Pierre Joye (pierre dot php at gmail dot com)
+- Somnath Banerjee (somnath dot banerjee at gmail dot com)
+- Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
--- a/Android.mk
+++ b/Android.mk
@@ -2,29 +2,41 @@ LOCAL_PATH:= $(call my-dir)

 include $(CLEAR_VARS)
 LOCAL_SRC_FILES := \
-	src/dec/bits.c \
+	src/dec/alpha.c \
 	src/dec/dsp.c \
 	src/dec/frame.c \
 	src/dec/idec.c \
+	src/dec/layer.c \
 	src/dec/quant.c \
 	src/dec/tree.c \
 	src/dec/vp8.c \
 	src/dec/webp.c \
-	src/dec/yuv.c \
+	src/dec/io.c \
+	src/dec/buffer.c \
+	src/dsp/yuv.c \
+	src/dsp/upsampling.c \
+	src/dsp/cpu.c \
+	src/dsp/dec.c \
+	src/dsp/dec_neon.c \
+	src/dsp/enc.c \
+	src/enc/alpha.c \
 	src/enc/analysis.c \
-	src/enc/bit_writer.c \
 	src/enc/config.c \
 	src/enc/dsp.c \
 	src/enc/filter.c \
 	src/enc/frame.c \
 	src/enc/iterator.c \
+	src/enc/layer.c \
 	src/enc/picture.c \
 	src/enc/quant.c \
 	src/enc/syntax.c \
 	src/enc/tree.c \
-	src/enc/webpenc.c
+	src/enc/webpenc.c \
+	src/utils/bit_reader.c \
+	src/utils/bit_writer.c \
+	src/utils/thread.c \

-LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD \
+LOCAL_CFLAGS := -Wall -DANDROID -DHAVE_MALLOC_H -DHAVE_PTHREAD -DWEBP_USE_THREAD \
                -finline-functions -frename-registers -ffast-math \
                -s -fomit-frame-pointer -Isrc/webp

--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,144 @@
-d3dc2d4 update ChangeLog
+dfc9c1e Harmonize the dates
+28ad70c Fix PNG decoding bug
+846e93c Update AUTHORS & add .mailmap
+563e52d cosmetics after '76036f5 Refactor decoder library'
+76036f5 Refactor decoder library
+377ef43 configure.ac: update AC_INIT params
+7a8d876 use a user-visible MACRO for max width/height.
+d4e9f55 NEON decode support in WebP
+0ee683b update libtool version-info
+fdbe02c windows: match _cond_destroy logic w/return variable name
+206b686 README: correct advanced decode api pseudo-code
+6a32a0f make VP8BitReader a typedef, for better re-use
+b112e83 create a libwebputils under src/utils
+ee697d9 harmonize the include guards and #endif comments
+a1ec07a Fixing compiler error in non x86 arch.
+dcfa509 Fixed recursive inclusion of bit_writer.h and vp8enci.h.
+e06ac08 create a separate libwebpdsp under src/dsp
+ebeb412 use unsigned int for bitfields
+341cc56 make kNewRange a static array
+227a91e README: minor wording update
+05bd8e6 add man pages to dist
+812dfa1 bump up versions in preparations for 0.1.3
+a5b78c8 wrap alpha-related options under WEBP_EXPERIMENTAL_FEATURES flag
+34dc790 regen ChangeLog for 0.1.3-rc2
+7c43663 Silence some (more) Visual Studio warnings.
+60306e8 add top-level gitattributes
+2aa6b80 Slience some Visual Studio warnings.
+4cbbb29 Merge "bump up version for next freeze"
+a329167 bump up version for next freeze
+c7e86ab cosmetics: fix comment line lengths
+c9e037a makefile.unix: add simple dist target
+87d58ce makefile.unix: rule maintenance
+d477de7 mend
+fac15ec Update NEWS & README for next release V0.1.3
+6215595 Merge "add a -partition_limit option to limit the number of bits used by intra4x4"
+3814b76 Merge "reorganize chunk-parsing code"
+900286e add a -partition_limit option to limit the number of bits used by intra4x4
+cd12b4b add the missing cost for I4/I16 mode selection
+dfcc213 reorganize chunk-parsing code
+3cf2030 initialize pointers to function within VP8DspInit()
+d21b479 Merge "windows: add decode threading support"
+473ae95 fix hang on thread creation failure
+fccca42 windows: add decode threading support
+a31f843 Use the exact PNG_INCLUDES/PNG_LIBS when testing for -lpng
+ad9b45f Merge "Makefile.vc: rule maintenance"
+565a2ca Makefile.vc: rule maintenance
+2d0da68 makefile.unix: disable Wvla by default
+fc7815d multi-thread decoding: ~25-30% faster
+acd8ba4 io->teardown() was not always called upon error
+c85527b Merge "Makefile.vc: add DLL configs"
+e1e9be3 cosmetics: spelling/grammar in README and lib headers
+b4d0ef8 Makefile.vc: add DLL configs
+998754a remove unused nb_i4_ and nb_i16_ fields.
+9f01ce3 rename WebPDecBuffer::memory -> private_memory
+fb5d659 fix an overflow bug in LUT calculation
+d646d5c swig: add WebPDecodeARGB
+78aeed4 add missing WebPDecodeARGBInto() and switch ARGB4444 to RGBA4444 as was intended
+cd7c529 explicitly mark library functions as extern
+19db59f add support for RGB565, ARGB4444 and ARGB colorspace (decoder)
+c915fb2 encoder speed-up: hardcode special level values
+c558bda Rename and improve the API to retrieve decoded area
+bf599d7 Merge "makefile.unix: disable -Wvla by default"
+c9ea03d SSE2 version of strong filtering
+993af3e makefile.unix: disable -Wvla by default
+3827e1b Merge "examples: (windows/WIC) add alpha support"
+e291fae SSE2 functions for the fancy upsampler.
+a06bbe2 add WebPISetIOHooks() to set some custom hooks on the incremental decoder object.
+7643a6f Merge "makefile.unix: use uname to detect OSX environment"
+5142a0b export alpha channel (if present) when dumping to PGM format
+14d5731 makefile.unix: use uname to detect OSX environment
+0805706 examples: quiet warnings
+3cfe088 examples: (windows/WIC) add alpha support
+13ed94b add compile warning for variable-length-array
+5a18eb1 Merge "add Advanced Decoding Interface"
+5c4f27f add missing \n
+f4c4e41 80 cols fix
+d260310 add Advanced Decoding Interface
+bd2f65f sse2 version of the complex filter
+96ed9ce perform two idct transforms at a time when possible
+01af7b6 use aligned stored
+0e1d1fd Merge "Makefile.vc: add experimental target"
+2a1292a Makefile.vc: add experimental target
+23bf351 Enable decode SSE2 for Visual Studio
+131a4b7 dec/dsp_sse2: fix visual studio compile
+00d9d68 swig: file reorganization
+7fc7e0d Merge "swig/java: basic encode support"
+3be57b1 fix MSVC compile for WEBP_EXPERIMENTAL_FEATURES
+40a7e34 dec/dsp: disable sse2 for Visual Studio builds
+e4d540c add SSE2 code for transform
+54f2170 swig/java: basic encode support
+c5d4584 call function pointers instead of C-version
+ea43f04 Merge "configure: mingw32 targets: test for WIC support"
+a11009d SSE2 version of simple in-loop filtering
+42548da shave one unneeded filter-cache line
+31f9dc6 configure: mingw32 targets: test for WIC support
+1955969 Merge "split expression in two."
+415dbe4 split expression in two.
+e29072a configure: test for zlib only w/--enable-experimental
+b2b0090 Simplify Visual Studio ifdefs
+ca7a2fd Add error reporting from encoding failures.
+6c9405d Merge "Makefile.vc: require CFG with clean target"
+0424ecd Makefile.vc: require CFG with clean target
+003417c Enable SSE2 for Visual Studio builds
+af10db4 little speed up for VP8BitUpdate()
+e71418f more MSVC files to ignore
+46d9036 cosmetics
+edf59ab typo fix
+72229f5 Add support for x64 and SSE2 builds under Windows.
+92e5c6e VP8GetInfo() + WebPResetDecParams()
+416b7a6 raise the fixed-point precision for the rescaler
+aa87e4e fix alignment
+eb66670 disable WEBP_EXPERIMENTAL_FEATURES
+c5ae7f6 typo fix: USE_ => WEBP_
+d041efa swig: add libwebp.jar/libwebp_java_wrap.c
+f6fb387 add swig interface
+e927390 align buffer for double too
+842c009 fix -strong option
+d0a7038 Merge "cosmetics"
+fc0a02e fix the dichotomy loop
+38369c0 cosmetics
+8dfc4c6 factorize and unify GetAlpha() between the C and SSE2 version
+6d0e66c prepare experimentation with yuv444 / 422
+79cc49f add a --enable-experimental option to './configure'
+d757523 sse2 version of CollectHistogram()
+c1c728d add an extra #ifdef WEBP_EXPERIMENTAL_FEATURES to avoid 'unused variable' warning
+60c61d2 always call VP*EncDeleteAlpha() unconditionnally, for simplicity
+0f8c638 simply don't call WriteExtensions() if WEBP_EXPERIMENTAL_FEATURES is not defined
+47c661d rename swap -> swap_rb
+10d55bb move chunk[] declaration out of the for() loop
+517cec2 fix indentation
+f7d9e26 fix merge problems
+8fd42b3 add a stride 'a_stride' for the alpha plane
+b8dcbf2 fix alpha-plane copy and crop methods
+cdef89d fix some 'unused variable' warning
+fb29c26 SSE2 version of the fwd transform and the squared sum metric
+2ab4b72 EXPERIMENTAL: add support for alpha channel
+cfbf88a add SSE2 functions. ~2x faster encoding on average.
+e7ff3f9 merge two ITransforms together when applicable and change the TTransform to return the sum directly.
+ca55413 fix WebPIDecGetRGB() to accept any RGB(A) mode, not just MODE_RGB
+8aa50ef fix some 'man' typos
+d3f3bdd update ChangeLog (v0.1.2)
 d7e9a69 update contributor list
 261abb8 add a 'superclean' section
 276ae82 Remove files not mean to be in git, and update .gitignore
--- a/Makefile.vc
+++ b/Makefile.vc
@@ -7,15 +7,26 @@ LIB_NAME_DEBUG = libwebp_a_debug
 #
 # Stem for DLL import libs
 #
-IMPLIB_NAME       = libwebp
-IMPLIB_NAME_DEBUG = libwepb_debug
+IMPLIB_NAME       = libwebp_dll
+IMPLIB_NAME_DEBUG = libwebp_dll_debug

 !IFNDEF DEP_PATH
 DEPS_PATH   = ../../deps
 !ENDIF

 !IFNDEF ARCH
-ARCH  = x86
+!IF ! [ cl 2>&1 | find "x86" > NUL ]
+ARCH = x86
+!ELSE IF ! [ cl 2>&1 | find "x64" > NUL ]
+ARCH = x64
+!ELSE
+!ERROR Unable to auto-detect toolchain architecture! \
+If cl.exe is in your PATH rerun nmake with ARCH=<arch>.
+!ENDIF
+!ENDIF
+
+!IF "$(ARCH)" == "x86"
+PLATFORM_LDFLAGS = /SAFESEH
 !ENDIF

 #############################################################
@@ -24,14 +35,14 @@ ARCH  = x86
 MT         = mt.exe
 CCNODBG    = cl.exe /nologo /O2 /DNDEBUG
 CCDEBUG    = cl.exe /nologo /Od /Gm /Zi /D_DEBUG /RTC1
-CFLAGS     = /Isrc /nologo /W3 /EHsc /DWIN32 /FD /c /GS /D_CRT_SECURE_NO_WARNINGS
-LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /SAFESEH /DYNAMICBASE
-CFLAGSLIB  = /DLIBWEBP_STATICLIB
+CFLAGS     = /Isrc /nologo /W3 /EHsc /FD /c /GS
+CFLAGS     = $(CFLAGS) /DWIN32 /D_CRT_SECURE_NO_WARNINGS /DWIN32_LEAN_AND_MEAN
+CFLAGS     = $(CFLAGS) /DHAVE_WINCODEC_H /DWEBP_USE_THREAD
+LDFLAGS    = /LARGEADDRESSAWARE /MANIFEST /NXCOMPAT /DYNAMICBASE $(PLATFORM_LDFLAGS)
 LNKDLL     = link.exe /DLL
 LNKLIB     = link.exe /lib
 LNKEXE     = link.exe
 LFLAGS     = /nologo /machine:$(ARCH)
-CFLAGS     = $(CFLAGS)

 CFGSET     = FALSE
 !IF "$(OBJDIR)" == ""
@@ -55,19 +66,35 @@ DIRLIB = $(DIRBASE)\lib
 DIRINC = $(DIRBASE)\include
 DIRBIN = $(DIRBASE)\bin

-# release-static
+# Target configuration
 !IF "$(CFG)" == "release-static"
-TARGET = $(LIB_NAME).lib
-LNK    = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
-CC     = $(CCNODBG) $(RTLIB) $(CFLAGSLIB)
-CFGSET = TRUE
+TARGET         = $(LIB_NAME).lib
+CC             = $(CCNODBG)
+STATICLIBBUILD = TRUE
+!ELSE IF "$(CFG)" == "debug-static"
+TARGET         = $(LIB_NAME_DEBUG).lib
+CC             = $(CCDEBUG)
+STATICLIBBUILD = TRUE
+!ELSE IF "$(CFG)" == "release-dynamic"
+TARGETDLL = $(LIB_NAME).dll
+TARGET    = $(IMPLIB_NAME).lib
+CC        = $(CCNODBG)
+DLLBUILD  = TRUE
+!ELSE IF "$(CFG)" == "debug-dynamic"
+TARGETDLL = $(LIB_NAME_DEBUG).dll
+TARGET    = $(IMPLIB_NAME_DEBUG).lib
+CC        = $(CCDEBUG)
+DLLBUILD  = TRUE
 !ENDIF

-# debug-static
-!IF "$(CFG)" == "debug-static"
-TARGET = $(LIB_NAME_DEBUG).lib
+!IF "$(STATICLIBBUILD)" == "TRUE"
+CC     = $(CC) $(RTLIB)
 LNK    = $(LNKLIB) /out:$(DIRLIB)\$(TARGET)
-CC     = $(CCDEBUG) $(RTLIBD) $(CFLAGSLIB)
+CFGSET = TRUE
+!ELSE IF "$(DLLBUILD)" == "TRUE"
+DLLINC = webp_dll.h
+CC     = $(CC) /I$(DIROBJ) /FI$(DLLINC) $(RTLIB) /DWEBP_DLL
+LNK    = $(LNKDLL) /out:$(DIRBIN)\$(TARGETDLL) /implib:$(DIRLIB)\$(TARGET)
 CFGSET = TRUE
 !ENDIF

@@ -75,13 +102,18 @@ CFGSET = TRUE
 # Usage
 #
 !IF "$(CFGSET)" == "FALSE"
-!MESSAGE Usage: nmake /f makefile.vc9 [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
+!MESSAGE Usage: nmake /f Makefile.vc [CFG=<config>] [OBJDIR=<path>] [RTLIBCFG=<rtlib>] [<target>]
 !MESSAGE where <config> is one of:
 !MESSAGE -  release-static                - release static library
 !MESSAGE -  debug-static                  - debug static library
-!MESSAGE -  (empty)                       - perform a clean
+!MESSAGE -  release-dynamic               - release dynamic link library (DLL)
+!MESSAGE -  debug-dynamic                 - debug dynamic link library (DLL)
+!MESSAGE <target> may be:
+!MESSAGE -  clean                         - perform a clean for CFG
+!MESSAGE -  experimental                  - build CFG with experimental
+!MESSAGE .                                  features enabled. Requires zlib.
 !MESSAGE
-!MESSAGE <rtlibcfg> controls the runtime library likage - can be 'static' or 'dynamic'.
+!MESSAGE <rtlibcfg> controls the runtime library linkage - can be 'static' or 'dynamic'.
 !MESSAGE <target> can be left blank in which case all is assumed
 !MESSAGE <path> is the path where you like to build (obj, bins, etc.)
 !MESSAGE   default to ..\obj\
@@ -93,39 +125,26 @@ CFGSET = TRUE
 !ENDIF

 #######################
-# Only the clean target can be used if a config was not provided.
+# Rules
 #
-!IF "$(CFGSET)" == "FALSE"
-!MESSAGE
-!MESSAGE No configuration provided - performing a clean.
-clean:
-	@-erase /s *.dll 2> NUL
-	@-erase /s *.exp 2> NUL
-	@-erase /s *.idb 2> NUL
-	@-erase /s *.lib 2> NUL
-	@-erase /s *.obj 2> NUL
-	@-erase /s *.pch 2> NUL
-	@-erase /s *.pdb 2> NUL
-	@-erase /s *.res 2> NUL
-!ELSE
+!IF "$(CFGSET)" == "TRUE"
 # A config was provided, so the library can be built.
 #

 X_OBJS= \
-	$(DIROBJ)\dec\bits.obj \
-	$(DIROBJ)\dec\dsp.obj \
 	$(DIROBJ)\dec\frame.obj \
 	$(DIROBJ)\dec\quant.obj \
 	$(DIROBJ)\dec\tree.obj \
 	$(DIROBJ)\dec\vp8.obj \
 	$(DIROBJ)\dec\webp.obj \
-	$(DIROBJ)\dec\yuv.obj \
+	$(DIROBJ)\dec\io.obj \
+	$(DIROBJ)\dec\buffer.obj \
 	$(DIROBJ)\dec\idec.obj \
+	$(DIROBJ)\dec\alpha.obj \
+	$(DIROBJ)\dec\layer.obj \
 	$(DIROBJ)\enc\analysis.obj \
-	$(DIROBJ)\enc\bit_writer.obj \
 	$(DIROBJ)\enc\config.obj \
 	$(DIROBJ)\enc\cost.obj \
-	$(DIROBJ)\enc\dsp.obj \
 	$(DIROBJ)\enc\frame.obj \
 	$(DIROBJ)\enc\filter.obj \
 	$(DIROBJ)\enc\iterator.obj \
@@ -134,6 +153,19 @@ X_OBJS= \
 	$(DIROBJ)\enc\syntax.obj \
 	$(DIROBJ)\enc\tree.obj \
 	$(DIROBJ)\enc\webpenc.obj \
+	$(DIROBJ)\enc\alpha.obj \
+	$(DIROBJ)\enc\layer.obj \
+	$(DIROBJ)\dsp\enc.obj \
+	$(DIROBJ)\dsp\enc_sse2.obj \
+	$(DIROBJ)\dsp\upsampling.obj \
+	$(DIROBJ)\dsp\upsampling_sse2.obj \
+	$(DIROBJ)\dsp\dec.obj \
+	$(DIROBJ)\dsp\dec_sse2.obj \
+	$(DIROBJ)\dsp\cpu.obj \
+	$(DIROBJ)\dsp\yuv.obj \
+	$(DIROBJ)\utils\bit_reader.obj \
+	$(DIROBJ)\utils\bit_writer.obj \
+	$(DIROBJ)\utils\thread.obj \
 	$(RESOURCE)

 EXAMPLES_OBJS = \
@@ -142,18 +174,25 @@ EXAMPLES_OBJS = \

 all: $(DIRLIB)\$(TARGET) $(DIRBIN)\dwebp.exe $(DIRBIN)\cwebp.exe

+# Additional include and library paths (for zlib) can be passed via the CL and
+# LINK environment variables respectively:
+# > set CL=/I\zlib\include
+# > set LINK=\zlib\zlib.lib
+# > nmake /f Makefile.vc CFG=release-static experimental
+experimental:
+	$(MAKE) /f Makefile.vc \
+	    CFG=$(CFG) CFLAGS="$(CFLAGS) /DWEBP_EXPERIMENTAL_FEATURES" /$(MAKEFLAGS)
+
 $(DIRLIB)\$(TARGET): $(X_OBJS)
 	$(LNK) $(LFLAGS) $(X_OBJS)
-	-xcopy $(DIROBJ)\$(LIB_NAME).dll       $(DIRBIN) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME).lib       $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).dll $(DIRBIN) /y
-	-xcopy $(DIROBJ)\$(LIB_NAME_DEBUG).lib $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(IMPLIB_NAME).lib    $(DIRLIB) /y
-	-xcopy $(DIROBJ)\$(IMPLIB_NAME_DEBUG).lib $(DIRLIB) /y
-	-xcopy $(DIROBJ)\*.exp                 $(DIRLIB) /y
-	-xcopy $(DIROBJ)\*.pdb                 $(DIRLIB) /y
+	-xcopy $(DIROBJ)\*.pdb $(DIRLIB) /y

-$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIRLIB) $(DIRINC) $(DIRBIN)
+$(X_OBJS): $(DIROBJ)\enc $(DIROBJ)\dec $(DIROBJ)\dsp $(DIROBJ)\utils $(DIRLIB) $(DIRINC) $(DIRBIN)
+!IF "$(DLLBUILD)" == "TRUE"
+$(X_OBJS): $(DIROBJ)\$(DLLINC)
+clean::
+	@-erase /s $(DIROBJ)\$(DLLINC) 2> NUL
+!ENDIF

 $(EXAMPLES_OBJS): $(DIROBJ)\examples $(DIRLIB)\$(TARGET)

@@ -166,6 +205,12 @@ $(DIROBJ)\examples:
 $(DIROBJ)\dec:
 	@if not exist "$(DIROBJ)\dec" mkdir $(DIROBJ)\dec

+$(DIROBJ)\dsp:
+	@if not exist "$(DIROBJ)\dsp" mkdir $(DIROBJ)\dsp
+
+$(DIROBJ)\utils:
+	@if not exist "$(DIROBJ)\utils" mkdir $(DIROBJ)\utils
+
 $(DIRLIB):
 	@if not exist "$(DIRLIB)" mkdir $(DIRLIB)

@@ -175,6 +220,13 @@ $(DIRINC):
 $(DIRBIN):
 	@if not exist "$(DIRBIN)" mkdir $(DIRBIN)

+# generate a helper include to define WEBP_EXTERN suitable for the DLL build
+$(DIROBJ)\$(DLLINC):
+	@echo #ifndef WEBP_DLL_H_ > $@
+	@echo #define WEBP_DLL_H_ >> $@
+	@echo #define WEBP_EXTERN(type) __declspec(dllexport) type >> $@
+	@echo #endif  /* WEBP_DLL_H_ */ >> $@
+
 .SUFFIXES: .c .obj .res .exe
 {examples}.c{$(DIROBJ)\examples}.obj:
 	$(CC) $(CFLAGS) /Fo"$@"  $<
@@ -182,10 +234,24 @@ $(DIRBIN):
 	$(CC) $(CFLAGS) /Fo"$@"  $<
 {src\enc}.c{$(DIROBJ)\enc}.obj:
 	$(CC) $(CFLAGS) /Fo"$@"  $<
+{src\dsp}.c{$(DIROBJ)\dsp}.obj:
+	$(CC) $(CFLAGS) /Fo"$@"  $<
+{src\utils}.c{$(DIROBJ)\utils}.obj:
+	$(CC) $(CFLAGS) /Fo"$@"  $<

 {$(DIROBJ)\examples}.obj{$(DIRBIN)}.exe:
 	$(LNKEXE) $(LDFLAGS) /OUT:"$@" $<  ole32.lib windowscodecs.lib shlwapi.lib $(DIRLIB)\$(TARGET)
 	$(MT) -manifest $@.manifest -outputresource:$@;1
 	del $@.manifest

+clean::
+	@-erase /s $(DIROBJ)\*.dll 2> NUL
+	@-erase /s $(DIROBJ)\*.exp 2> NUL
+	@-erase /s $(DIROBJ)\*.idb 2> NUL
+	@-erase /s $(DIROBJ)\*.lib 2> NUL
+	@-erase /s $(DIROBJ)\*.obj 2> NUL
+	@-erase /s $(DIROBJ)\*.pch 2> NUL
+	@-erase /s $(DIROBJ)\*.pdb 2> NUL
+	@-erase /s $(DIROBJ)\*.res 2> NUL
+
 !ENDIF  # End of case where a config was provided.
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,12 @@
+- 9/19/11: version 0.1.3
+  * Advanced decoding APIs.
+  * On-the-fly cropping and rescaling of images.
+  * SSE2 instructions for decoding performance optimizations on x86 based platforms.
+  * Support Multi-threaded decoding.
+  * 40% improvement in Decoding performance.
+  * Add support for RGB565, RGBA4444 & ARGB image colorspace.
+  * Better handling of large picture encoding.
+
 - 3/25/11: version 0.1.2
  * Incremental decoding: picture can be decoded byte-by-byte if needs be.
  * lot of bug-fixes, consolidation and stabilization
--- a/README
+++ b/README
@@ -4,12 +4,12 @@
          \__\__/\____/\_____/__/ ____  ___
                / _/ /    \    \ /  _ \/ _/
               /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.1.2
+               \____/____/\_____/_____/____/v0.1.3

 Description:
 ============

-WEBP codec: Library to encode and decode images in WebP format. This package
+WebP codec: library to encode and decode images in WebP format. This package
 contains the library that can be used in other programs to add WebP support,
 as well as the command line tools 'cwebp' and 'dwebp'.

@@ -32,9 +32,11 @@ By running:

  nmake /f Makefile.vc CFG=release-static RTLIBCFG=static OBJDIR=output

-the directory output\release-static\x86\bin will contain the tools
-cweb.exe and dweb.exe. The directory output\release-static\x86\lib will
-contains the libwebp static library.
+the directory output\release-static\(x64|x86)\bin will contain the tools
+cwebp.exe and dwebp.exe. The directory output\release-static\(x64|x86)\lib will
+contain the libwebp static library.
+The target architecture (x86/x64) is detected by Makefile.vc from the Visual
+Studio compiler (cl.exe) available in the system path.

 Unix build using makefile.unix:
 -------------------------------
@@ -56,9 +58,6 @@ Using autoconf tools:
 make
 make install

-Note: In case './configure' step fails, try generating configure & appropriate
-Makefile(s) via command 'aclocal && autoconf && automake -a -c;'.
-
 should be all you need to have the following files

 /usr/local/include/webp/decode.h
@@ -73,9 +72,37 @@ installed.

 Note: The encoding and decoding libraries are compiled separately
 (as src/dec/libwebpdecode.* and src/dec/libwebpencode.*). They
-can be installed independently using a minor modifications in the
+can be installed independently using a minor modification in the
 corresponding Makefile.am configure files (see comments there).

+SWIG bindings:
+--------------
+
+To generate language bindings from swig/libwebp.i, swig-1.3
+(http://www.swig.org) is required. 2.0 may work, but has not been tested.
+
+Currently the following functions are mapped:
+Decode:
+  WebPGetDecoderVersion
+  WebPGetInfo
+  WebPDecodeRGB
+  WebPDecodeRGBA
+  WebPDecodeARGB
+  WebPDecodeBGR
+  WebPDecodeBGRA
+Encode:
+  WebPGetEncoderVersion
+  WebPEncodeRGB
+  WebPEncodeRGBA
+  WebPEncodeBGR
+  WebPEncodeBGRA
+
+Java bindings:
+
+To build the swig-generated JNI wrapper code, at least JDK-1.5 (or equivalent)
+is necessary for enum support. The output is intended to be a shared object /
+DLL that can be loaded via System.loadLibrary("webp_jni").
+
 Encoding tool:
 ==============

@@ -84,7 +111,7 @@ decoding (dwebp) images.

 The easiest use should look like:
  cwebp input.png -q 80 -o output.webp
-which will convert the input PNG or JPEG file to a WebP one using a
+which will convert the input PNG or JPEG file to a WebP file using a
 quality factor of 80 on a 0->100 scale (0 being the lowest quality,
 100 being the best. Default value is 75).

@@ -112,14 +139,21 @@ options:
  -f <int> ............... filter strength (0=off..100)
  -sharpness <int> ....... filter sharpness (0:most .. 7:least sharp)
  -strong ................ use strong filter instead of simple.
+  -partition_limit <int> . limit quality to fit the 512k limit on
+                           the first partition (0=no degradation ... 100=full)
+  -alpha_comp <int> ...... set the transparency-compression
+  -noalpha ............... discard any transparency information.
  -pass <int> ............ analysis pass number (1..10)
  -partitions <int> ...... number of partitions to use (0..3)
  -crop <x> <y> <w> <h> .. crop picture with the given rectangle
+  -resize <w> <h> ........ resize picture (after any cropping)
  -map <int> ............. print map of extra info.
  -d <file.pgm> .......... dump the compressed output (PGM file).

  -short ................. condense printed message
  -quiet ................. don't print anything.
+  -version ............... print version number and exit.
+  -noasm ................. disable all assembly optimizations.
  -v ..................... verbose, e.g. print encoding/decoding times

 Experimental Options:
@@ -137,7 +171,7 @@ visual quality are:
 -m

 Namely:
-  * 'preset' will set up a default encoding configuration targetting a
+  * 'preset' will set up a default encoding configuration targeting a
     particular type of input. It should appear first in the list of options,
     so that subsequent options can take effect on top of this preset.
     Default value is 'default'.
@@ -161,7 +195,7 @@ Namely:
 Decoding tool:
 ==============

-There is a decoding sample code as examples/dwebp.c which will take
+There is a decoding sample in examples/dwebp.c which will take
 a .webp file and decode it to a PNG image file (amongst other formats).
 This is simply to demonstrate the use of the API. You can verify the
 file test.webp decodes to exactly the same as test_ref.ppm by using:
@@ -170,9 +204,29 @@ file test.webp decodes to exactly the same as test_ref.ppm by using:
 ./dwebp test.webp -ppm -o test.ppm
 diff test.ppm test_ref.ppm

+The full list of options is available using -h:
+
+> dwebp -h
+Usage: dwebp in_file [options] [-o out_file]
+
+Decodes the WebP image file to PNG format [Default]
+Use following options to convert into alternate image formats:
+  -ppm ......... save the raw RGB samples as color PPM
+  -pgm ......... save the raw YUV samples as a grayscale PGM
+                 file with IMC4 layout.
+ Other options are:
+  -version  .... print version number and exit.
+  -nofancy ..... don't use the fancy YUV420 upscaler.
+  -nofilter .... disable in-loop filtering.
+  -mt .......... use multi-threading
+  -crop <x> <y> <w> <h> ... crop output with the given rectangle
+  -scale <w> <h> .......... scale the output (*after* any cropping)
+  -h     ....... this help message.
+  -v     ....... verbose (e.g. print encoding/decoding times)
+  -noasm ....... disable all assembly optimizations.

 Encoding API:
-===========
+=============

 The main encoding functions are available in the header src/webp/encode.h
 The ready-to-use ones are:
@@ -188,10 +242,12 @@ size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
 They will convert raw RGB samples to a WebP data. The only control supplied
 is the quality factor.

+Advanced encoding API:
+----------------------

 A more advanced API is based on the WebPConfig and WebPPicture structures.

-WebPConfig contains the encoding settings and is not tied a to a particular
+WebPConfig contains the encoding settings and is not tied to a particular
 picture.
 WebPPicture contains input data, on which some WebPConfig will be used for
 compression.
@@ -210,7 +266,7 @@ The encoding flow looks like:
  // ... additional tuning
  config.sns_strength = 90;
  config.filter_sharpness = 6;
-  config_error = WebPValidateConfig(&config);  // not mandartory, but useful
+  config_error = WebPValidateConfig(&config);  // not mandatory, but useful

  // Setup the input data
  WebPPicture pic;
@@ -223,14 +279,13 @@ The encoding flow looks like:
  if (!WebPPictureAllocate(&pic)) {
    return 0;   // memory error
  }
-  // add that point, 'pic' has been initialized as a container,
+  // at this point, 'pic' has been initialized as a container,
  // and can receive the Y/U/V samples.
  // Alternatively, one could use ready-made import functions like
  // WebPPictureImportRGB(), which will take care of memory allocation.
  // In any case, past this point, one will have to call
  // WebPPictureFree(&pic) to reclaim memory.

-
  // Set up a byte-output write method. WebPMemoryWriter, for instance.
  WebPMemoryWriter wrt;
  pic.writer = MyFileWriter;
@@ -238,14 +293,13 @@ The encoding flow looks like:
  // initialize 'wrt' here...

  // Compress!
-  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occured!
+  int ok = WebPEncode(&config, &pic);   // ok = 0 => error occurred!
  WebPPictureFree(&pic);  // must be called independently of the 'ok' result.

  // output data should have been handled by the writer at that point.

 -------------------------------------- END PSEUDO EXAMPLE

-
 Decoding API:
 =============

@@ -256,9 +310,9 @@ uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
                       int *width, int *height);

 Please have a look at the file src/webp/decode.h for the details.
-There are variants for decoding in BGR/RGBA/BGRA order, along with decoding to
-raw Y'CbCr samples. One can also decode the image directly into a pre-allocated
-buffer.
+There are variants for decoding in BGR/RGBA/ARGB/BGRA order, along with
+decoding to raw Y'CbCr samples. One can also decode the image directly into a
+pre-allocated buffer.

 To detect a WebP file and gather picture's dimensions, the function:
  int WebPGetInfo(const uint8_t* data, uint32_t data_size,
@@ -291,14 +345,14 @@ or by just mentioning the new size of the transmitted data:
  WebPIUpdate(idec, buffer, size_of_transmitted_buffer);

 Note that 'buffer' can be modified between each calls to WebPIUpdate, in
-particular when the buffer is resized to accomodate larger data.
+particular when the buffer is resized to accommodate larger data.

 These functions will return the decoding status: either VP8_STATUS_SUSPENDED if
 decoding is not finished yet, or VP8_STATUS_OK when decoding is done.
 Any other status is an error condition.

 The idec object must always be released (even upon an error condition)
-by calling: WebPDelete(idec)
+by calling: WebPIDelete(idec).

 To retrieve partially decoded picture samples, one must use the corresponding
 method: WebPIDecGetRGB or WebPIDecGetYUV.
@@ -310,6 +364,72 @@ WebPINewRGB() or WebPINewYUV().

 Please have a look at the src/webp/decode.h header for further details.

+Advanced Decoding API:
+======================
+
+WebP decoding supports an advanced API which provides on-the-fly cropping and
+rescaling, something of great usefulness in memory-constrained environments like
+mobile phones. Basically, the memory usage will scale with the output's size,
+not the input's, when one only needs a quick preview or a zoomed in portion of
+an otherwise too-large picture. Some CPU can be saved too, incidentally.
+
+-------------------------------------- BEGIN PSEUDO EXAMPLE
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config' options, if needed
+     config.options.no_fancy_upsampling = 1;
+     config.options.use_scaling = 1;
+     config.options.scaled_width = scaledWidth();
+     config.options.scaled_height = scaledHeight();
+     // etc.
+
+     // D) Set the 'config' output options, e.g. the output colorspace.
+     // Optionally, an external image decode buffer can also be specified.
+     config.output.colorspace = MODE_BGRA;
+     // Optionally, the config.output can be pointed to an external buffer as
+     // well for decoding the image. This externally supplied memory buffer
+     // should be big enough to store the decoded picture.
+     config.output.u.RGBA.rgba = (uint8_t*) memory_buffer;
+     config.output.u.RGBA.stride = scanline_stride;
+     config.output.u.RGBA.size = total_size_of_the_memory_buffer;
+     config.output.is_external_memory = 1;
+
+     // E) Decode the WebP image. There are two variants w.r.t decoding image.
+     // The first one (E.1) decodes the full image and the second one (E.2) is
+     // used to incrementally decode the image using small input buffers.
+     // Any one of these steps can be used to decode the WebP image.
+
+     // E.1) Decode full image.
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E.2) Decode image incrementally.
+     WebPIDecoder* const idec = WebPIDecode(NULL, NULL, &config);
+     CHECK(idec != NULL);
+     while (bytes_remaining > 0) {
+       VP8StatusCode status = WebPIAppend(idec, input, bytes_read);
+       if (status == VP8_STATUS_OK || status == VP8_STATUS_SUSPENDED) {
+         bytes_remaining -= bytes_read;
+       } else {
+         break;
+       }
+     }
+     WebPIDelete(idec);
+
+     // F) Decoded image is now in config.output (and config.output.u.RGBA).
+     // It can be saved, displayed or otherwise processed.
+
+     // G) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+
+-------------------------------------- END PSEUDO EXAMPLE
+
 Bugs:
 =====

@@ -322,3 +442,4 @@ Discuss:
 ========

 Email: webp-discuss@webmproject.org
+Web: http://groups.google.com/a/webmproject.org/group/webp-discuss
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,7 @@
-AC_INIT([webpdecode], [0.1])
+AC_INIT([libwebp], [0.1.3],
+        [http://code.google.com/p/webp/issues],,
+        [http://code.google.com/speed/webp])
+AC_CANONICAL_TARGET
 AM_INIT_AUTOMAKE([-Wall foreign subdir-objects])
 AC_PROG_LIBTOOL
 AM_PROG_CC_C_O
@@ -8,6 +11,15 @@ AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=PATH],
 	[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
 AC_SUBST([pkgconfigdir])

+dnl === Check libz is present
+
+if test "$enable_experimental" = "yes"; then
+  AC_CHECK_HEADER(zlib.h,
+    AC_CHECK_LIB(z, gzsetparams,,AC_MSG_ERROR(zlib library not found)),
+    AC_MSG_ERROR(zlib not available - no zlib.h)
+  )
+fi
+
 dnl === check for PNG support ===

 PNG_INCLUDES=""
@@ -28,6 +40,11 @@ AC_ARG_WITH(pnglibdir,
            [--with-pnglibdir=DIR    use PNG libraries from DIR],
            [PNG_LIBS="-L$withval"])

+SAVED_CPPFLAGS=$CPPFLAGS
+SAVED_LIBS=$LIBS
+CPPFLAGS="$PNG_INCLUDES $CPPFLAGS"
+LIBS="$PNG_LIBS $LIBS"
+
 AC_CHECK_HEADER(png.h,
  AC_CHECK_LIB(png, main,
               [PNG_LIBS="$PNG_LIBS -lpng"
@@ -41,6 +58,9 @@ AC_CHECK_HEADER(png.h,
 AC_SUBST(PNG_LIBS)
 AC_SUBST(PNG_INCLUDES)

+CPPFLAGS=$SAVED_CPPFLAGS
+LIBS=$SAVED_LIBS
+
 dnl === check for JPEG support ===

 JPEG_INCLUDES=""
@@ -52,6 +72,11 @@ AC_ARG_WITH(jpeglibdir,
            [--with-jpeglibdir=DIR    use JPEG libraries from DIR],
            [JPEG_LIBS="-L$withval"])

+SAVED_CPPFLAGS=$CPPFLAGS
+SAVED_LIBS=$LIBS
+CPPFLAGS="$JPEG_INCLUDES $CPPFLAGS"
+LIBS="$JPEG_LIBS $LIBS"
+
 AC_CHECK_HEADER(jpeglib.h,
  AC_CHECK_LIB(jpeg, jpeg_set_defaults,
               [JPEG_LIBS="$JPEG_LIBS -ljpeg"
@@ -65,9 +90,73 @@ AC_CHECK_HEADER(jpeglib.h,
 AC_SUBST(JPEG_LIBS)
 AC_SUBST(JPEG_INCLUDES)

+CPPFLAGS=$SAVED_CPPFLAGS
+LIBS=$SAVED_LIBS
+
+dnl === check for WIC support ===
+
+if test "$target_os" = "mingw32"; then
+  AC_CHECK_HEADERS([wincodec.h shlwapi.h windows.h])
+  if test "$ac_cv_header_wincodec_h" = "yes"; then
+    AC_MSG_CHECKING(for Windows Imaging Component support)
+    SAVED_LIBS=$LIBS
+    LIBS="-lshlwapi -lole32 $LIBS"
+    # match include structure from [cd]webp.c
+    wic_headers="
+      #define INITGUID
+      #define CINTERFACE
+      #define COBJMACROS
+      #define _WIN32_IE 0x500
+
+      #include <shlwapi.h>
+      #include <windows.h>
+      #include <wincodec.h>
+      "
+    # test for functions from each lib and that the GUID is created properly
+    wic_main="
+      int main(void) {
+        CLSID_WICImagingFactory;
+        CoInitialize(NULL);
+        SHCreateStreamOnFile(NULL, 0, NULL);
+        return 0;
+      }
+      "
+    AC_LANG_PUSH(C)
+    AC_LINK_IFELSE(
+      [AC_LANG_SOURCE([
+         $wic_headers
+         $wic_main])],
+      [wic_support=yes],
+      [wic_support=no]
+    )
+    AC_LANG_POP
+
+    test "$wic_support" = "yes" || LIBS=$SAVED_LIBS
+    AC_MSG_RESULT(${wic_support-no})
+  fi
+fi
+
+dnl === If --enable-experimental is defined, add the flag WEBP_EXPERIMENTAL_FEATURES
+
+USE_EXPERIMENTAL_CODE=""
+AC_MSG_CHECKING(if --enable-experimental option is specified)
+AC_ARG_ENABLE(experimental, [  --enable-experimental         Activate experimental features])
+if test "$enable_experimental" = "yes"; then
+        AC_DEFINE(EXPERIMENTAL,,[Enable experimental code])
+        USE_EXPERIMENTAL_CODE="-DWEBP_EXPERIMENTAL_FEATURES"
+fi
+AC_MSG_RESULT(${enable_experimental-no})
+AC_SUBST(USE_EXPERIMENTAL_CODE)
+
+dnl =========================
+
 AC_CONFIG_MACRO_DIR([m4])
 AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_FILES([Makefile src/Makefile man/Makefile examples/Makefile src/dec/Makefile src/enc/Makefile src/libwebp.pc])
+AC_CONFIG_FILES([Makefile src/Makefile man/Makefile \
+                 examples/Makefile src/dec/Makefile \
+                 src/enc/Makefile src/dsp/Makefile \
+                 src/utils/Makefile \
+                 src/libwebp.pc])


 AC_OUTPUT
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -3,9 +3,9 @@ AM_CPPFLAGS = -I$(top_srcdir)/src
 bin_PROGRAMS = dwebp cwebp

 dwebp_SOURCES = dwebp.c stopwatch.h
-dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
+dwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
 dwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)

 cwebp_SOURCES = cwebp.c stopwatch.h
-cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES)
+cwebp_CPPFLAGS = $(AM_CPPFLAGS) $(PNG_INCLUDES) $(JPEG_INCLUDES) $(USE_EXPERIMENTAL_CODE)
 cwebp_LDADD = ../src/libwebp.la $(PNG_LIBS) $(JPEG_LIBS)
--- a/examples/cwebp.c
+++ b/examples/cwebp.c
@@ -14,6 +14,10 @@
 #include <stdlib.h>
 #include <string.h>

+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #ifdef WEBP_HAVE_PNG
 #include <png.h>
 #endif
@@ -23,7 +27,10 @@
 #include <jpeglib.h>
 #endif

-#ifdef _WIN32
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
 #define CINTERFACE
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
@@ -31,13 +38,26 @@
 #include <shlwapi.h>
 #include <windows.h>
 #include <wincodec.h>
+
+#ifndef GUID_WICPixelFormat24bppRGB
+// From Microsoft SDK 7.0a
+DEFINE_GUID(GUID_WICPixelFormat24bppRGB,
+    0x6fddc324, 0x4e03, 0x4bfe, 0xb1, 0x85, 0x3d, 0x77, 0x76, 0x8d, 0xc9, 0x0d);
 #endif
+#ifndef GUID_WICPixelFormat32bppRGBA
+DEFINE_GUID(GUID_WICPixelFormat32bppRGBA,
+    0xf5c7ad2d, 0x6a8d, 0x43dd, 0xa7, 0xa8, 0xa2, 0x99, 0x35, 0x26, 0x1a, 0xe9);
+#endif
+#endif  /* HAVE_WINCODEC_H */


 #include "webp/encode.h"
 #include "stopwatch.h"
+#ifndef WEBP_DLL
+extern void* VP8GetCPUInfo;   // opaque forward declaration.
+#endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static int verbose = 0;

@@ -68,7 +88,7 @@ static int ReadYUV(FILE* in_file, WebPPicture* const pic) {
  return ok;
 }

-#ifdef _WIN32
+#ifdef HAVE_WINCODEC_H

 #define IFS(fn)                \
  do {                         \
@@ -95,7 +115,7 @@ static HRESULT OpenInputStream(const char* filename, IStream** ppStream) {
 }

 static HRESULT ReadPictureWithWIC(const char* filename,
-                                  WebPPicture* const pic) {
+                                  WebPPicture* const pic, int keep_alpha) {
  HRESULT hr = S_OK;
  IWICBitmapFrameDecode* pFrame = NULL;
  IWICFormatConverter* pConverter = NULL;
@@ -105,6 +125,15 @@ static HRESULT ReadPictureWithWIC(const char* filename,
  UINT frameCount = 0;
  UINT width, height = 0;
  BYTE* rgb = NULL;
+  WICPixelFormatGUID srcPixelFormat = { 0 };
+  GUID srcContainerFormat = { 0 };
+  const GUID* alphaContainers[] = {
+    &GUID_ContainerFormatBmp,
+    &GUID_ContainerFormatPng,
+    &GUID_ContainerFormatTiff
+  };
+  int has_alpha = 0;
+  int i, stride;

  IFS(CoInitialize(NULL));
  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
@@ -125,28 +154,53 @@ static HRESULT ReadPictureWithWIC(const char* filename,
    hr = E_FAIL;
  }
  IFS(IWICBitmapDecoder_GetFrame(pDecoder, 0, &pFrame));
+  IFS(IWICBitmapFrameDecode_GetPixelFormat(pFrame, &srcPixelFormat));
+  IFS(IWICBitmapDecoder_GetContainerFormat(pDecoder, &srcContainerFormat));
+
+  has_alpha = keep_alpha;
+  for (i = 0;
+       has_alpha && i < sizeof(alphaContainers)/sizeof(alphaContainers[0]);
+       ++i) {
+    if (IsEqualGUID(&srcContainerFormat, alphaContainers[i])) {
+      has_alpha =
+          IsEqualGUID(&srcPixelFormat, &GUID_WICPixelFormat32bppRGBA) ||
+          IsEqualGUID(&srcPixelFormat, &GUID_WICPixelFormat32bppBGRA);
+      break;
+    }
+  }

  // Prepare for pixel format conversion (if necessary).
  IFS(IWICImagingFactory_CreateFormatConverter(pFactory, &pConverter));
  IFS(IWICFormatConverter_Initialize(pConverter, (IWICBitmapSource*)pFrame,
-          MAKE_REFGUID(GUID_WICPixelFormat24bppRGB), WICBitmapDitherTypeNone,
+          has_alpha ? MAKE_REFGUID(GUID_WICPixelFormat32bppRGBA)
+                    : MAKE_REFGUID(GUID_WICPixelFormat24bppRGB),
+          WICBitmapDitherTypeNone,
          NULL, 0.0, WICBitmapPaletteTypeCustom));

  // Decode.
  IFS(IWICFormatConverter_GetSize(pConverter, &width, &height));
+  stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
  if (SUCCEEDED(hr)) {
-    rgb = (BYTE*)malloc(3 * width * height);
+    rgb = (BYTE*)malloc(stride * height);
    if (rgb == NULL)
      hr = E_OUTOFMEMORY;
  }
-  IFS(IWICFormatConverter_CopyPixels(pConverter, NULL, 3 * width,
-          3 * width * height, rgb));
+  IFS(IWICFormatConverter_CopyPixels(pConverter, NULL, stride,
+          stride * height, rgb));

  // WebP conversion.
  if (SUCCEEDED(hr)) {
+    int ok;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (has_alpha) {
+      pic->colorspace |= WEBP_CSP_ALPHA_BIT;
+    }
+#endif
    pic->width = width;
    pic->height = height;
-    if (!WebPPictureImportRGB(pic, rgb, 3 * width))
+    ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
+                   : WebPPictureImportRGB(pic, rgb, stride);
+    if (!ok)
      hr = E_FAIL;
  }

@@ -160,7 +214,8 @@ static HRESULT ReadPictureWithWIC(const char* filename,
  return hr;
 }

-static int ReadPicture(const char* const filename, WebPPicture* const pic) {
+static int ReadPicture(const char* const filename, WebPPicture* const pic,
+                       int keep_alpha) {
  int ok;
  if (pic->width != 0 && pic->height != 0) {
    // If image size is specified, infer it as YUV format.
@@ -173,7 +228,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
    fclose(in_file);
  } else {
    // If no size specified, try to decode it using WIC.
-    ok = SUCCEEDED(ReadPictureWithWIC(filename, pic));
+    ok = SUCCEEDED(ReadPictureWithWIC(filename, pic, keep_alpha));
  }
  if (!ok) {
    fprintf(stderr, "Error! Could not process file %s\n", filename);
@@ -181,7 +236,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
  return ok;
 }

-#else  // !_WIN32
+#else  // !HAVE_WINCODEC_H

 #ifdef WEBP_HAVE_JPEG
 struct my_error_mgr {
@@ -268,6 +323,8 @@ static int ReadJPEG(FILE* in_file, WebPPicture* const pic) {

 #else
 static int ReadJPEG(FILE* in_file, WebPPicture* const pic) {
+  (void)in_file;
+  (void)pic;
  printf("JPEG support not compiled. Please install the libjpeg development "
         "package before building.\n");
  return 0;
@@ -280,10 +337,11 @@ static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
  longjmp(png_jmpbuf(png), 1);
 }

-static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
+static int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha) {
  png_structp png;
  png_infop info;
  int color_type, bit_depth, interlaced;
+  int has_alpha;
  int num_passes;
  int p;
  int ok = 0;
@@ -324,13 +382,24 @@ static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
  }
  if (png_get_valid(png, info, PNG_INFO_tRNS)) {
    png_set_tRNS_to_alpha(png);
+    has_alpha = 1;
+  } else {
+    has_alpha = !!(color_type & PNG_COLOR_MASK_ALPHA);
  }

-  // TODO(skal): Strip Alpha for now (till Alpha is supported).
-  png_set_strip_alpha(png);
+  if (!keep_alpha) {
+    png_set_strip_alpha(png);
+    has_alpha = 0;
+  }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (has_alpha) {
+    pic->colorspace |= WEBP_CSP_ALPHA_BIT;
+  }
+#endif
+
  num_passes = png_set_interlace_handling(png);
  png_read_update_info(png, info);
-  stride = 3 * width * sizeof(*rgb);
+  stride = (has_alpha ? 4 : 3) * width * sizeof(*rgb);
  rgb = (uint8_t*)malloc(stride * height);
  if (rgb == NULL) goto Error;
  for (p = 0; p < num_passes; ++p) {
@@ -344,14 +413,18 @@ static int ReadPNG(FILE* in_file, WebPPicture* const pic) {

  pic->width = width;
  pic->height = height;
-  ok = WebPPictureImportRGB(pic, rgb, stride);
+  ok = has_alpha ? WebPPictureImportRGBA(pic, rgb, stride)
+                 : WebPPictureImportRGB(pic, rgb, stride);
  free(rgb);

 End:
  return ok;
 }
 #else
-static int ReadPNG(FILE* in_file, WebPPicture* const pic) {
+static int ReadPNG(FILE* in_file, WebPPicture* const pic, int keep_alpha) {
+  (void)in_file;
+  (void)pic;
+  (void)keep_alpha;
  printf("PNG support not compiled. Please install the libpng development "
         "package before building.\n");
  return 0;
@@ -383,7 +456,8 @@ static InputFileFormat GetImageType(FILE* in_file) {
  return format;
 }

-static int ReadPicture(const char* const filename, WebPPicture* const pic) {
+static int ReadPicture(const char* const filename, WebPPicture* const pic,
+                       int keep_alpha) {
  int ok = 0;
  FILE* in_file = fopen(filename, "rb");
  if (in_file == NULL) {
@@ -395,7 +469,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
    // If no size specified, try to decode it as PNG/JPEG (as appropriate).
    const InputFileFormat format = GetImageType(in_file);
    if (format == PNG) {
-      ok = ReadPNG(in_file, pic);
+      ok = ReadPNG(in_file, pic, keep_alpha);
    } else if (format == JPEG) {
      ok = ReadJPEG(in_file, pic);
    }
@@ -411,7 +485,7 @@ static int ReadPicture(const char* const filename, WebPPicture* const pic) {
  return ok;
 }

-#endif  // !_WIN32
+#endif  // !HAVE_WINCODEC_H

 static void AllocExtraInfo(WebPPicture* const pic) {
  const int mb_w = (pic->width + 15) / 16;
@@ -473,6 +547,14 @@ static void PrintExtraInfo(const WebPPicture* const pic, int short_output) {
              100.f * stats->header_bytes[0] / stats->coded_size,
              stats->header_bytes[1],
              100.f * stats->header_bytes[1] / stats->coded_size);
+      if (stats->alpha_data_size) {
+        fprintf(stderr, "             transparency:   %6d\n",
+                stats->alpha_data_size);
+      }
+      if (stats->layer_data_size) {
+        fprintf(stderr, "             enhancement:    %6d\n",
+                stats->layer_data_size);
+      }
      fprintf(stderr, " Residuals bytes  "
                      "|segment 1|segment 2|segment 3"
                      "|segment 4|  total\n");
@@ -519,7 +601,7 @@ static void PrintExtraInfo(const WebPPicture* const pic, int short_output) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static int MyWriter(const uint8_t* data, size_t data_size,
                    const WebPPicture* const pic) {
@@ -533,7 +615,8 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
  const int uv_width = (picture->width + 1) / 2;
  const int uv_height = (picture->height + 1) / 2;
  const int stride = (picture->width + 1) & ~1;
-  const int height = picture->height + uv_height;
+  const int alpha_height = picture->a ? picture->height : 0;
+  const int height = picture->height + uv_height + alpha_height;
  FILE* const f = fopen(PGM_name, "wb");
  if (!f) return 0;
  fprintf(f, "P5\n%d %d\n255\n", stride, height);
@@ -548,11 +631,16 @@ static int DumpPicture(const WebPPicture* const picture, const char* PGM_name) {
    if (fwrite(picture->v + y * picture->uv_stride, uv_width, 1, f) != 1)
      return 0;
  }
+  for (y = 0; y < alpha_height; ++y) {
+    if (fwrite(picture->a + y * picture->a_stride, picture->width, 1, f) != 1)
+      return 0;
+    if (picture->width & 1) fputc(0, f);  // pad
+  }
  fclose(f);
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static void HelpShort(void) {
  printf("Usage:\n\n");
@@ -567,7 +655,7 @@ static void HelpLong(void) {
  printf(" cwebp [-preset <...>] [options] in_file [-o out_file]\n\n");
  printf("If input size (-s) for an image is not specified, "
         "it is assumed to be a PNG or JPEG file.\n");
-#ifdef _WIN32
+#ifdef HAVE_WINCODEC_H
  printf("Windows builds can take as input any of the files handled by WIC\n");
 #endif
  printf("options:\n");
@@ -581,6 +669,8 @@ static void HelpLong(void) {
  printf("\n");
  printf("  -m <int> ............... compression method (0=fast, 6=slowest)\n");
  printf("  -segments <int> ........ number of segments to use (1..4)\n");
+  printf("  -size <int> ............ Target size (in bytes)\n");
+  printf("  -psnr <float> .......... Target PSNR (in dB. typically: 42)\n");
  printf("\n");
  printf("  -s <int> <int> ......... Input size (width x height) for YUV\n");
  printf("  -sns <int> ............. Spatial Noise Shaping (0:off, 100:max)\n");
@@ -588,26 +678,60 @@ static void HelpLong(void) {
  printf("  -sharpness <int> ....... "
         "filter sharpness (0:most .. 7:least sharp)\n");
  printf("  -strong ................ use strong filter instead of simple.\n");
+  printf("  -partition_limit <int> . limit quality to fit the 512k limit on\n");
+  printf("                           "
+         "the first partition (0=no degradation ... 100=full)\n");
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  printf("  -alpha_comp <int> ...... set the transparency-compression\n");
+  printf("  -noalpha ............... discard any transparency information.\n");
+#endif
  printf("  -pass <int> ............ analysis pass number (1..10)\n");
  printf("  -crop <x> <y> <w> <h> .. crop picture with the given rectangle\n");
+  printf("  -resize <w> <h> ........ resize picture (after any cropping)\n");
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  printf("  -444 / -422 / -gray ..... Change colorspace\n");
+#endif
  printf("  -map <int> ............. print map of extra info.\n");
  printf("  -d <file.pgm> .......... dump the compressed output (PGM file).\n");
+
  printf("\n");
  printf("  -short ................. condense printed message\n");
  printf("  -quiet ................. don't print anything.\n");
  printf("  -version ............... print version number and exit.\n");
+#ifndef WEBP_DLL
+  printf("  -noasm ................. disable all assembly optimizations.\n");
+#endif
  printf("  -v ..................... verbose, e.g. print encoding/decoding "
         "times\n");
  printf("\n");
  printf("Experimental Options:\n");
-  printf("  -size <int> ............ Target size (in bytes)\n");
-  printf("  -psnr <float> .......... Target PSNR (in dB. typically: 42)\n");
  printf("  -af .................... auto-adjust filter strength.\n");
  printf("  -pre <int> ............. pre-processing filter\n");
  printf("\n");
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Error messages, indexed by picture.error_code (one entry per error code).
+
+static const char* const kErrorMessages[] = {
+  "OK",
+  "OUT_OF_MEMORY: Out of memory allocating objects",
+  "BITSTREAM_OUT_OF_MEMORY: Out of memory re-allocating byte buffer",
+  "NULL_PARAMETER: NULL parameter passed to function",
+  "INVALID_CONFIGURATION: configuration is invalid",
+  "BAD_DIMENSION: Bad picture dimension. Maximum width and height "
+  "allowed is 16383 pixels.",
+  "PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k.\n"
+  "To reduce the size of this partition, try using less segments "
+  "with the -segments option, and eventually reduce the number of "
+  "header bits using -partition_limit. More details are available "
+  "in the manual (`man cwebp`)",
+  "PARTITION_OVERFLOW: Partition is too big to fit 16M",
+  "BAD_WRITE: Picture writer returned an I/O error",
+  "FILE_TOO_BIG: File would be too big to fit in 4G"
+};
+
+//------------------------------------------------------------------------------

 int main(int argc, const char *argv[]) {
  const char *in_file = NULL, *out_file = NULL, *dump_file = NULL;
@@ -615,12 +739,18 @@ int main(int argc, const char *argv[]) {
  int c;
  int short_output = 0;
  int quiet = 0;
+  int keep_alpha = 0;
  int crop = 0, crop_x = 0, crop_y = 0, crop_w = 0, crop_h = 0;
+  int resize_w = 0, resize_h = 0;
  WebPPicture picture;
  WebPConfig config;
  WebPAuxStats stats;
  Stopwatch stop_watch;

+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  keep_alpha = 1;
+#endif
+
  if (!WebPPictureInit(&picture) || !WebPConfigInit(&config)) {
    fprintf(stderr, "Error! Version mismatch!\n");
    goto Error;
@@ -651,18 +781,18 @@ int main(int argc, const char *argv[]) {
    } else if (!strcmp(argv[c], "-m") && c < argc - 1) {
      config.method = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-q") && c < argc - 1) {
-      config.quality = strtod(argv[++c], NULL);
+      config.quality = (float)strtod(argv[++c], NULL);
    } else if (!strcmp(argv[c], "-size") && c < argc - 1) {
      config.target_size = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-psnr") && c < argc - 1) {
-      config.target_PSNR = strtod(argv[++c], NULL);
+      config.target_PSNR = (float)strtod(argv[++c], NULL);
    } else if (!strcmp(argv[c], "-sns") && c < argc - 1) {
      config.sns_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-f") && c < argc - 1) {
      config.filter_strength = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-af")) {
      config.autofilter = 1;
-    } else if (!strcmp(argv[c], "-strong") && c < argc - 1) {
+    } else if (!strcmp(argv[c], "-strong")) {
      config.filter_type = 1;
    } else if (!strcmp(argv[c], "-sharpness") && c < argc - 1) {
      config.filter_sharpness = strtol(argv[++c], NULL, 0);
@@ -672,14 +802,37 @@ int main(int argc, const char *argv[]) {
      config.preprocessing = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-segments") && c < argc - 1) {
      config.segments = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-partition_limit") && c < argc - 1) {
+      config.partition_limit = strtol(argv[++c], NULL, 0);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    } else if (!strcmp(argv[c], "-alpha_comp") && c < argc - 1) {
+      config.alpha_compression = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-noalpha")) {
+      keep_alpha = 0;
+#endif
    } else if (!strcmp(argv[c], "-map") && c < argc - 1) {
      picture.extra_info_type = strtol(argv[++c], NULL, 0);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    } else if (!strcmp(argv[c], "-444")) {
+      picture.colorspace = WEBP_YUV444;
+    } else if (!strcmp(argv[c], "-422")) {
+      picture.colorspace = WEBP_YUV422;
+    } else if (!strcmp(argv[c], "-gray")) {
+      picture.colorspace = WEBP_YUV400;
+#endif
    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
      crop = 1;
      crop_x = strtol(argv[++c], NULL, 0);
      crop_y = strtol(argv[++c], NULL, 0);
      crop_w = strtol(argv[++c], NULL, 0);
      crop_h = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-resize") && c < argc - 2) {
+      resize_w = strtol(argv[++c], NULL, 0);
+      resize_h = strtol(argv[++c], NULL, 0);
+#ifndef WEBP_DLL
+    } else if (!strcmp(argv[c], "-noasm")) {
+      VP8GetCPUInfo = NULL;
+#endif
    } else if (!strcmp(argv[c], "-version")) {
      const int version = WebPGetEncoderVersion();
      printf("%d.%d.%d\n",
@@ -727,9 +880,10 @@ int main(int argc, const char *argv[]) {
  }

  // Read the input
-  if (verbose)
+  if (verbose) {
    StopwatchReadAndReset(&stop_watch);
-  if (!ReadPicture(in_file, &picture)) {
+  }
+  if (!ReadPicture(in_file, &picture, keep_alpha)) {
    fprintf(stderr, "Error! Cannot read input picture\n");
    goto Error;
  }
@@ -761,15 +915,26 @@ int main(int argc, const char *argv[]) {
  picture.stats = &stats;

  // Compress
-  if (verbose)
+  if (verbose) {
    StopwatchReadAndReset(&stop_watch);
+  }
  if (crop != 0 && !WebPPictureCrop(&picture, crop_x, crop_y, crop_w, crop_h)) {
    fprintf(stderr, "Error! Cannot crop picture\n");
    goto Error;
  }
-  if (picture.extra_info_type > 0) AllocExtraInfo(&picture);
+  if ((resize_w | resize_h) > 0) {
+    if (!WebPPictureRescale(&picture, resize_w, resize_h)) {
+      fprintf(stderr, "Error! Cannot resize picture\n");
+      goto Error;
+    }
+  }
+  if (picture.extra_info_type > 0) {
+    AllocExtraInfo(&picture);
+  }
  if (!WebPEncode(&config, &picture)) {
    fprintf(stderr, "Error! Cannot encode picture as WebP\n");
+    fprintf(stderr, "Error code: %d (%s)\n",
+            picture.error_code, kErrorMessages[picture.error_code]);
    goto Error;
  }
  if (verbose) {
@@ -778,8 +943,12 @@ int main(int argc, const char *argv[]) {
  }

  // Write info
-  if (dump_file) DumpPicture(&picture, dump_file);
-  if (!quiet) PrintExtraInfo(&picture, short_output);
+  if (dump_file) {
+    DumpPicture(&picture, dump_file);
+  }
+  if (!quiet) {
+    PrintExtraInfo(&picture, short_output);
+  }

 Error:
  free(picture.extra_info);
@@ -791,4 +960,4 @@ int main(int argc, const char *argv[]) {
  return 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
--- a/examples/dwebp.c
+++ b/examples/dwebp.c
@@ -5,8 +5,7 @@
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-//  simple command-line example calling libwebpdecode to
-//  decode a WebP image into a PPM image.
+//  Command-line tool for decoding a WebP image
 //
 //  Compile with:     gcc -o dwebp dwebp.c -lwebpdecode
 //
@@ -17,11 +16,18 @@
 #include <stdlib.h>
 #include <string.h>

+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #ifdef WEBP_HAVE_PNG
 #include <png.h>
 #endif

-#ifdef _WIN32
+#ifdef HAVE_WINCODEC_H
+#ifdef __MINGW32__
+#define INITGUID  // Without this GUIDs are declared extern and fail to link
+#endif
 #define CINTERFACE
 #define COBJMACROS
 #define _WIN32_IE 0x500  // Workaround bug in shlwapi.h when compiling C++
@@ -38,11 +44,22 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
-
 static int verbose = 0;
+#ifndef WEBP_DLL
+extern void* VP8GetCPUInfo;   // opaque forward declaration.
+#endif

-#ifdef _WIN32
+//------------------------------------------------------------------------------
+
+// Output types
+typedef enum {
+  PNG = 0,
+  PPM,
+  PGM,
+  ALPHA_PLANE_ONLY  // this is for experimenting only
+} OutputFileFormat;
+
+#ifdef HAVE_WINCODEC_H

 #define IFS(fn)                \
  do {                         \
@@ -60,7 +77,8 @@ static int verbose = 0;
 #define MAKE_REFGUID(x) &(x)
 #endif

-static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream) {
+static HRESULT CreateOutputStream(const char* out_file_name,
+                                  IStream** ppStream) {
  HRESULT hr = S_OK;
  IFS(SHCreateStreamOnFileA(out_file_name, STGM_WRITE | STGM_CREATE, ppStream));
  if (FAILED(hr))
@@ -70,13 +88,14 @@ static HRESULT CreateOutputStream(const char* out_file_name, IStream** ppStream)

 static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
                             unsigned char* rgb, int stride,
-                             uint32_t width, uint32_t height) {
+                             uint32_t width, uint32_t height, int has_alpha) {
  HRESULT hr = S_OK;
  IWICImagingFactory* pFactory = NULL;
  IWICBitmapFrameEncode* pFrame = NULL;
  IWICBitmapEncoder* pEncoder = NULL;
  IStream* pStream = NULL;
-  GUID pixel_format = GUID_WICPixelFormat24bppBGR;
+  WICPixelFormatGUID pixel_format = has_alpha ? GUID_WICPixelFormat32bppBGRA
+                                              : GUID_WICPixelFormat24bppBGR;

  IFS(CoInitialize(NULL));
  IFS(CoCreateInstance(MAKE_REFGUID(CLSID_WICImagingFactory), NULL,
@@ -108,21 +127,31 @@ static HRESULT WriteUsingWIC(const char* out_file_name, REFGUID container_guid,
  return hr;
 }

-static int WritePNG(const char* out_file_name, unsigned char* rgb, int stride,
-                    uint32_t width, uint32_t height) {
+static int WritePNG(const char* out_file_name,
+                    const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_BGRA);
+
  return SUCCEEDED(WriteUsingWIC(out_file_name,
             MAKE_REFGUID(GUID_ContainerFormatPng), rgb, stride, width,
-             height));
+             height, has_alpha));
 }

-#elif defined(WEBP_HAVE_PNG)    // !WIN32
+#elif defined(WEBP_HAVE_PNG)    // !HAVE_WINCODEC_H
 static void PNGAPI error_function(png_structp png, png_const_charp dummy) {
  (void)dummy;  // remove variable-unused warning
  longjmp(png_jmpbuf(png), 1);
 }

-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height) {
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  const int has_alpha = (buffer->colorspace == MODE_RGBA);
  png_structp png;
  png_infop info;
  png_uint_32 y;
@@ -142,7 +171,8 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
    return 0;
  }
  png_init_io(png, out_file);
-  png_set_IHDR(png, info, width, height, 8, PNG_COLOR_TYPE_RGB,
+  png_set_IHDR(png, info, width, height, 8,
+               has_alpha ? PNG_COLOR_TYPE_RGBA : PNG_COLOR_TYPE_RGB,
               PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT,
               PNG_FILTER_TYPE_DEFAULT);
  png_write_info(png, info);
@@ -154,12 +184,13 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
  png_destroy_write_struct(&png, &info);
  return 1;
 }
-#else    // !WIN32 && !WEBP_HAVE_PNG
+#else    // !HAVE_WINCODEC_H && !WEBP_HAVE_PNG

 typedef uint32_t png_uint_32;

-static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
-                    png_uint_32 width, png_uint_32 height) {
+static int WritePNG(FILE* out_file, const WebPDecBuffer* const buffer) {
+  (void)out_file;
+  (void)buffer;
  printf("PNG support not compiled. Please install the libpng development "
         "package before building.\n");
  printf("You can run with -ppm flag to decode in PPM format.\n");
@@ -167,71 +198,172 @@ static int WritePNG(FILE* out_file, unsigned char* rgb, int stride,
 }
 #endif

-static int WritePPM(FILE* fout, unsigned char* rgb,
-                    uint32_t width, uint32_t height) {
+static int WritePPM(FILE* fout, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const unsigned char* const rgb = buffer->u.RGBA.rgba;
+  const int stride = buffer->u.RGBA.stride;
+  uint32_t y;
  fprintf(fout, "P6\n%d %d\n255\n", width, height);
-  return (fwrite(rgb, width * height, 3, fout) == 3);
+  for (y = 0; y < height; ++y) {
+    if (fwrite(rgb + y * stride, width, 3, fout) != 3) {
+      return 0;
+    }
+  }
+  return 1;
 }

-static int WritePGM(FILE* fout,
-                    unsigned char* y_plane, unsigned char *u, unsigned char* v,
-                    int y_stride, int uv_stride,
-                    uint32_t width, uint32_t height) {
+static int WriteAlphaPlane(FILE* fout, const WebPDecBuffer* const buffer) {
+  const uint32_t width = buffer->width;
+  const uint32_t height = buffer->height;
+  const unsigned char* const a = buffer->u.YUVA.a;
+  const int a_stride = buffer->u.YUVA.a_stride;
+  uint32_t y;
+  assert(a != NULL);
+  fprintf(fout, "P5\n%d %d\n255\n", width, height);
+  for (y = 0; y < height; ++y) {
+    if (fwrite(a + y * a_stride, width, 1, fout) != 1) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static int WritePGM(FILE* fout, const WebPDecBuffer* const buffer) {
+  const int width = buffer->width;
+  const int height = buffer->height;
+  const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
  // Save a grayscale PGM file using the IMC4 layout
  // (http://www.fourcc.org/yuv.php#IMC4). This is a very
  // convenient format for viewing the samples, esp. for
  // odd dimensions.
  int ok = 1;
-  unsigned int y;
-  const unsigned int uv_width = (width + 1) / 2;
-  const unsigned int uv_height = (height + 1) / 2;
-  const unsigned int out_stride = (width + 1) & ~1;
-  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height);
+  int y;
+  const int uv_width = (width + 1) / 2;
+  const int uv_height = (height + 1) / 2;
+  const int out_stride = (width + 1) & ~1;
+  const int a_height = yuv->a ? height : 0;
+  fprintf(fout, "P5\n%d %d\n255\n", out_stride, height + uv_height + a_height);
  for (y = 0; ok && y < height; ++y) {
-    ok &= (fwrite(y_plane + y * y_stride, width, 1, fout) == 1);
+    ok &= (fwrite(yuv->y + y * yuv->y_stride, width, 1, fout) == 1);
    if (width & 1) fputc(0, fout);    // padding byte
  }
  for (y = 0; ok && y < uv_height; ++y) {
-    ok &= (fwrite(u + y * uv_stride, uv_width, 1, fout) == 1);
-    ok &= (fwrite(v + y * uv_stride, uv_width, 1, fout) == 1);
+    ok &= (fwrite(yuv->u + y * yuv->u_stride, uv_width, 1, fout) == 1);
+    ok &= (fwrite(yuv->v + y * yuv->v_stride, uv_width, 1, fout) == 1);
+  }
+  for (y = 0; ok && y < a_height; ++y) {
+    ok &= (fwrite(yuv->a + y * yuv->a_stride, width, 1, fout) == 1);
+    if (width & 1) fputc(0, fout);    // padding byte
  }
  return ok;
 }

-typedef enum {
-  PNG = 0,
-  PPM,
-  PGM,
-} OutputFileFormat;
+static void SaveOutput(const WebPDecBuffer* const buffer,
+                       OutputFileFormat format, const char* const out_file) {
+  FILE* fout = NULL;
+  int needs_open_file = 1;
+  int ok = 1;
+  Stopwatch stop_watch;
+
+  if (verbose)
+    StopwatchReadAndReset(&stop_watch);
+
+#ifdef HAVE_WINCODEC_H
+  needs_open_file = (format != PNG);
+#endif
+  if (needs_open_file) {
+    fout = fopen(out_file, "wb");
+    if (!fout) {
+      fprintf(stderr, "Error opening output file %s\n", out_file);
+      return;
+    }
+  }
+
+  if (format == PNG) {
+#ifdef HAVE_WINCODEC_H
+    ok &= WritePNG(out_file, buffer);
+#else
+    ok &= WritePNG(fout, buffer);
+#endif
+  } else if (format == PPM) {
+    ok &= WritePPM(fout, buffer);
+  } else if (format == PGM) {
+    ok &= WritePGM(fout, buffer);
+  } else if (format == ALPHA_PLANE_ONLY) {
+    ok &= WriteAlphaPlane(fout, buffer);
+  }
+  if (fout) {
+    fclose(fout);
+  }
+  if (ok) {
+    printf("Saved file %s\n", out_file);
+    if (verbose) {
+      const double time = StopwatchReadAndReset(&stop_watch);
+      printf("Time to write output: %.3fs\n", time);
+    }
+  } else {
+    fprintf(stderr, "Error writing file %s !!\n", out_file);
+  }
+}

 static void Help(void) {
-  printf("Usage: dwebp "
-         "[in_file] [-h] [-v] [-ppm] [-pgm] [-version] [-o out_file]\n\n"
+  printf("Usage: dwebp in_file [options] [-o out_file]\n\n"
         "Decodes the WebP image file to PNG format [Default]\n"
         "Use following options to convert into alternate image formats:\n"
-         " -ppm:  save the raw RGB samples as color PPM\n"
-         " -pgm:  save the raw YUV samples as a grayscale PGM\n"
-         "        file with IMC4 layout.\n"
-         " -version: print version number and exit.\n"
-         "Use -v for verbose (e.g. print encoding/decoding times)\n"
+         "  -ppm ......... save the raw RGB samples as color PPM\n"
+         "  -pgm ......... save the raw YUV samples as a grayscale PGM\n"
+         "                 file with IMC4 layout.\n"
+         " Other options are:\n"
+         "  -version  .... print version number and exit.\n"
+         "  -nofancy ..... don't use the fancy YUV420 upscaler.\n"
+         "  -nofilter .... disable in-loop filtering.\n"
+         "  -mt .......... use multi-threading\n"
+         "  -crop <x> <y> <w> <h> ... crop output with the given rectangle\n"
+         "  -scale <w> <h> .......... scale the output (*after* any cropping)\n"
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+         "  -alpha ....... only save the alpha plane.\n"
+#endif
+         "  -h     ....... this help message.\n"
+         "  -v     ....... verbose (e.g. print encoding/decoding times)\n"
+#ifndef WEBP_DLL
+         "  -noasm ....... disable all assembly optimizations.\n"
+#endif
        );
 }

+static const char* const kStatusMessages[] = {
+  "OK", "OUT_OF_MEMORY", "INVALID_PARAM", "BITSTREAM_ERROR",
+  "UNSUPPORTED_FEATURE", "SUSPENDED", "USER_ABORT", "NOT_ENOUGH_DATA"
+};
+
 int main(int argc, const char *argv[]) {
  const char *in_file = NULL;
  const char *out_file = NULL;

-  int width, height, stride, uv_stride;
-  uint8_t* out = NULL, *u = NULL, *v = NULL;
+  WebPDecoderConfig config;
+  WebPDecBuffer* const output_buffer = &config.output;
+  WebPBitstreamFeatures* const bitstream = &config.input;
  OutputFileFormat format = PNG;
-  Stopwatch stop_watch;
  int c;
+
+  if (!WebPInitDecoderConfig(&config)) {
+    fprintf(stderr, "Library version mismatch!\n");
+    return -1;
+  }
+
  for (c = 1; c < argc; ++c) {
    if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
      Help();
      return 0;
    } else if (!strcmp(argv[c], "-o") && c < argc - 1) {
      out_file = argv[++c];
+    } else if (!strcmp(argv[c], "-alpha")) {
+      format = ALPHA_PLANE_ONLY;
+    } else if (!strcmp(argv[c], "-nofancy")) {
+      config.options.no_fancy_upsampling = 1;
+    } else if (!strcmp(argv[c], "-nofilter")) {
+      config.options.bypass_filtering = 1;
    } else if (!strcmp(argv[c], "-ppm")) {
      format = PPM;
    } else if (!strcmp(argv[c], "-version")) {
@@ -241,8 +373,24 @@ int main(int argc, const char *argv[]) {
      return 0;
    } else if (!strcmp(argv[c], "-pgm")) {
      format = PGM;
+    } else if (!strcmp(argv[c], "-mt")) {
+      config.options.use_threads = 1;
+    } else if (!strcmp(argv[c], "-crop") && c < argc - 4) {
+      config.options.use_cropping = 1;
+      config.options.crop_left   = strtol(argv[++c], NULL, 0);
+      config.options.crop_top    = strtol(argv[++c], NULL, 0);
+      config.options.crop_width  = strtol(argv[++c], NULL, 0);
+      config.options.crop_height = strtol(argv[++c], NULL, 0);
+    } else if (!strcmp(argv[c], "-scale") && c < argc - 2) {
+      config.options.use_scaling = 1;
+      config.options.scaled_width  = strtol(argv[++c], NULL, 0);
+      config.options.scaled_height = strtol(argv[++c], NULL, 0);
    } else if (!strcmp(argv[c], "-v")) {
      verbose = 1;
+#ifndef WEBP_DLL
+    } else if (!strcmp(argv[c], "-noasm")) {
+      VP8GetCPUInfo = NULL;
+#endif
    } else if (argv[c][0] == '-') {
      printf("Unknown option '%s'\n", argv[c]);
      Help();
@@ -259,10 +407,13 @@ int main(int argc, const char *argv[]) {
  }

  {
+    Stopwatch stop_watch;
+    VP8StatusCode status = VP8_STATUS_OK;
+    int ok;
    uint32_t data_size = 0;
    void* data = NULL;
-    int ok;
    FILE* const in = fopen(in_file, "rb");
+
    if (!in) {
      fprintf(stderr, "cannot open input file '%s'\n", in_file);
      return 1;
@@ -274,97 +425,74 @@ int main(int argc, const char *argv[]) {
    ok = (fread(data, data_size, 1, in) == 1);
    fclose(in);
    if (!ok) {
+      fprintf(stderr, "Could not read %d bytes of data from file %s\n",
+              data_size, in_file);
      free(data);
      return -1;
    }

    if (verbose)
      StopwatchReadAndReset(&stop_watch);
+
+    status = WebPGetFeatures((const uint8_t*)data, data_size, bitstream);
+    if (status != VP8_STATUS_OK) {
+      goto end;
+    }
+
    switch (format) {
      case PNG:
-#ifdef _WIN32
-        out = WebPDecodeBGR((const uint8_t*)data, data_size, &width, &height);
+#ifdef HAVE_WINCODEC_H
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_BGRA : MODE_BGR;
 #else
-        out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_RGBA : MODE_RGB;
 #endif
        break;
      case PPM:
-        out = WebPDecodeRGB((const uint8_t*)data, data_size, &width, &height);
+        output_buffer->colorspace = MODE_RGB;  // drops alpha for PPM
        break;
      case PGM:
-        out = WebPDecodeYUV((const uint8_t*)data, data_size, &width, &height,
-                            &u, &v, &stride, &uv_stride);
+        output_buffer->colorspace = bitstream->has_alpha ? MODE_YUVA : MODE_YUV;
+        break;
+      case ALPHA_PLANE_ONLY:
+        output_buffer->colorspace = MODE_YUVA;
        break;
      default:
        free(data);
        return -1;
    }
+    status = WebPDecode((const uint8_t*)data, data_size, &config);

    if (verbose) {
      const double time = StopwatchReadAndReset(&stop_watch);
      printf("Time to decode picture: %.3fs\n", time);
    }
-
+ end:
    free(data);
-  }
-
-  if (!out) {
-    fprintf(stderr, "Decoding of %s failed.\n", in_file);
-    return -1;
+    ok = (status == VP8_STATUS_OK);
+    if (!ok) {
+      fprintf(stderr, "Decoding of %s failed.\n", in_file);
+      fprintf(stderr, "Status: %d (%s)\n", status, kStatusMessages[status]);
+      return -1;
+    }
  }

  if (out_file) {
-    FILE* fout = NULL;
-    int needs_open_file = 0;
-
-    printf("Decoded %s. Dimensions: %d x %d. Now saving...\n", in_file, width, height);
-    StopwatchReadAndReset(&stop_watch);
-#ifdef _WIN32
-    if (format != PNG) {
-      needs_open_file = 1;
-    }
-#else
-    needs_open_file = 1;
-#endif
-    if (needs_open_file) fout = fopen(out_file, "wb");
-    if (!needs_open_file || fout) {
-      int ok = 1;
-      if (format == PNG) {
-#ifdef _WIN32
-        ok &= WritePNG(out_file, out, 3 * width, width, height);
-#else
-        ok &= WritePNG(fout, out, 3 * width, width, height);
-#endif
-      } else if (format == PPM) {
-        ok &= WritePPM(fout, out, width, height);
-      } else if (format == PGM) {
-        ok &= WritePGM(fout, out, u, v, stride, uv_stride, width, height);
-      }
-      if (fout)
-        fclose(fout);
-      if (ok) {
-        printf("Saved file %s\n", out_file);
-        if (verbose) {
-          const double time = StopwatchReadAndReset(&stop_watch);
-          printf("Time to write output: %.3fs\n", time);
-        }
-      } else {
-        fprintf(stderr, "Error writing file %s !!\n", out_file);
-      }
-    } else {
-      fprintf(stderr, "Error opening output file %s\n", out_file);
-    }
+    printf("Decoded %s. Dimensions: %d x %d%s. Now saving...\n", in_file,
+           output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
+    SaveOutput(output_buffer, format, out_file);
  } else {
-    printf("File %s can be decoded (dimensions: %d x %d).\n",
-           in_file, width, height);
+    printf("File %s can be decoded (dimensions: %d x %d)%s.\n",
+           in_file, output_buffer->width, output_buffer->height,
+           bitstream->has_alpha ? " (with alpha)" : "");
    printf("Nothing written; use -o flag to save the result as e.g. PNG.\n");
  }
-  free(out);
+  WebPFreeDecBuffer(output_buffer);

  return 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/examples/stopwatch.h
+++ b/examples/stopwatch.h
@@ -30,7 +30,7 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {
 }


-#else    // !_WIN32
+#else    /* !_WIN32 */
 #include <sys/time.h>

 typedef struct timeval Stopwatch;
@@ -42,6 +42,6 @@ static inline double StopwatchReadAndReset(Stopwatch* watch) {
      (watch->tv_usec - old_value.tv_usec) / 1000000.0;
 }

-#endif   // !_WIN32
+#endif   /* _WIN32 */

-#endif  // WEBP_EXAMPLES_STOPWATCH_H_
+#endif  /* WEBP_EXAMPLES_STOPWATCH_H_ */
--- a/makefile.unix
+++ b/makefile.unix
@@ -13,8 +13,8 @@
 # These flag assume you have libpng and libjpeg installed. If not, either
 # follow below install instructions or just comment out the next lines.
 EXTRA_FLAGS= -DWEBP_HAVE_PNG -DWEBP_HAVE_JPEG
-EXTRA_LIBS= -lpng -ljpeg
-ifeq ("$(HOSTTYPE)", "intel-mac")
+EXTRA_LIBS= -lpng -ljpeg -lz
+ifeq ($(strip $(shell uname)), Darwin)
  EXTRA_FLAGS += -I/opt/local/include
  EXTRA_LIBS  += -L/opt/local/lib
 endif
@@ -33,56 +33,103 @@ endif
 # 'make -f makefile.unix EXTRA_FLAGS=-m32' to that effect.
 # EXTRA_FLAGS += -m32

+# Extra flags to enable experimental features and code
+# EXTRA_FLAGS += -DWEBP_EXPERIMENTAL_FEATURES
+
+# Extra flags to enable multi-threading
+EXTRA_FLAGS += -DWEBP_USE_THREAD
+EXTRA_LIBS += -lpthread
+
 # Extra flags to emulate C89 strictness with the full ANSI
 EXTRA_FLAGS += -Wextra -Wold-style-definition
 EXTRA_FLAGS += -Wmissing-prototypes
 EXTRA_FLAGS += -Wmissing-declarations
 EXTRA_FLAGS += -Wdeclaration-after-statement
+# EXTRA_FLAGS += -Wvla

 #### Nothing should normally be changed below this line ####

+AR = ar
+ARFLAGS = r
 CC = gcc -Isrc/ -Iexamples/ -Wall
 CFLAGS = -O3 -DNDEBUG $(EXTRA_FLAGS)
-LDFLAGS = src/libwebp.a $(EXTRA_LIBS) -lm
+INSTALL = install
+LDFLAGS = $(EXTRA_LIBS) -lm
+
+DEC_OBJS = src/dec/frame.o src/dec/webp.o src/dec/quant.o src/dec/tree.o \
+           src/dec/vp8.o src/dec/idec.o src/dec/alpha.o src/dec/layer.o \
+           src/dec/io.o src/dec/buffer.o
+ENC_OBJS = src/enc/webpenc.o src/enc/syntax.o \
+           src/enc/alpha.o src/enc/layer.o \
+           src/enc/tree.o src/enc/config.o src/enc/frame.o \
+           src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
+           src/enc/cost.o src/enc/picture.o src/enc/filter.o
+DSP_OBJS = src/dsp/cpu.o src/dsp/enc.o \
+           src/dsp/enc_sse2.o src/dsp/dec.o src/dsp/dec_sse2.o \
+           src/dsp/dec_neon.o src/dsp/upsampling.o src/dsp/upsampling_sse2.o \
+           src/dsp/yuv.o
+UTILS_OBJS = src/utils/bit_reader.o src/utils/bit_writer.o src/utils/thread.o
+
+OBJS = $(DEC_OBJS) $(ENC_OBJS) $(DSP_OBJS) $(UTILS_OBJS)
+
+HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/cost.h \
+       src/dec/vp8i.h  \
+       src/dsp/yuv.h src/dsp/dsp.h \
+       src/utils/bit_writer.h src/utils/bit_reader.h src/utils/thread.h

-OBJS = src/enc/webpenc.o src/enc/bit_writer.o src/enc/syntax.o \
-       src/enc/dsp.o src/enc/tree.o src/enc/config.o src/enc/frame.o \
-       src/enc/quant.o src/enc/iterator.o src/enc/analysis.o \
-       src/enc/cost.o src/enc/picture.o src/enc/filter.o \
-       src/dec/bits.o src/dec/dsp.o src/dec/frame.o src/dec/webp.o \
-       src/dec/quant.o src/dec/tree.o src/dec/vp8.o src/dec/yuv.o \
-       src/dec/idec.o
-HDRS = src/webp/encode.h src/enc/vp8enci.h src/enc/bit_writer.h \
-       src/enc/cost.h src/dec/bits.h  src/dec/vp8i.h src/dec/yuv.h
 OUTPUT = examples/cwebp examples/dwebp src/libwebp.a

 all:ex

-.c.o:  $(HDRS)
+%.o: %.c $(HDRS)
 	$(CC) $(CFLAGS) -c $< -o $@

-libwebp.a:  $(OBJS) $(HDRS)
-	ar r src/libwebp.a $(OBJS)
+src/libwebp.a:  $(OBJS)
+	$(AR) $(ARFLAGS) $@ $^

-ex: examples/cwebp.o examples/dwebp.o libwebp.a
-	$(CC) -o examples/cwebp examples/cwebp.o $(LDFLAGS)
-	$(CC) -o examples/dwebp examples/dwebp.o $(LDFLAGS)
+ex: examples/cwebp examples/dwebp
+
+examples/cwebp: examples/cwebp.o src/libwebp.a
+examples/dwebp: examples/dwebp.o src/libwebp.a
+examples/cwebp examples/dwebp:
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+dist: DESTDIR := dist
+dist: all
+	$(INSTALL) -m755 -d $(DESTDIR)/include/webp \
+	    $(DESTDIR)/doc $(DESTDIR)/lib
+	$(INSTALL) -m755 -s examples/cwebp examples/dwebp $(DESTDIR)
+	$(INSTALL) -m644 src/webp/*.h $(DESTDIR)/include/webp
+	$(INSTALL) -m644 src/libwebp.a $(DESTDIR)/lib
+	umask 022; \
+	for m in man/[cd]webp.1; do \
+	  basenam=$$(basename $$m .1); \
+	  /usr/bin/groff -t -e -man -T utf8 $$m \
+	    | col -bx >$(DESTDIR)/doc/$${basenam}.txt; \
+	  /usr/bin/groff -t -e -man -T html $$m \
+	    | col -bx >$(DESTDIR)/doc/$${basenam}.html; \
+	done

 clean:
-	rm -f ${OUTPUT} *~ \
+	$(RM) ${OUTPUT} *~ \
              src/enc/*.o src/enc/*~ \
              src/dec/*.o src/dec/*~ \
+              src/dsp/*.o src/dsp/*~ \
+              src/utils/*.o src/utils/*~ \
              examples/*.o examples/*~

 superclean: clean
-	rm -rf .git *.log *.cache *~
-	rm -rf .deps */.deps */*/.deps
-	rm -rf .libs */.libs */*/.libs
-	rm -f */*.lo */*/*.lo
-	rm -f */*.la */*/*.la
-	rm -f Makefile */Makefile */*/Makefile
-	rm -f Makefile.in */Makefile.in */*/Makefile.in
-	rm -f config.log autom4te.cache libtool config.h stamp-h1
-	rm -f aclocal.m4 compile config.guess config.h.in config.sub config.status
-	rm -f configure depcomp install-sh ltmain.sh missing src/libwebp.pc
-	rm -f m4/*
+	$(RM) -r .git *.log *.cache *~
+	$(RM) -r .deps */.deps */*/.deps
+	$(RM) -r .libs */.libs */*/.libs
+	$(RM) */*.lo */*/*.lo
+	$(RM) */*.la */*/*.la
+	$(RM) Makefile */Makefile */*/Makefile
+	$(RM) Makefile.in */Makefile.in */*/Makefile.in
+	$(RM) config.log autom4te.cache libtool config.h stamp-h1
+	$(RM) aclocal.m4 compile config.guess config.h.in config.sub config.status
+	$(RM) configure depcomp install-sh ltmain.sh missing src/libwebp.pc
+	$(RM) m4/*
+
+.PHONY: all clean dist ex superclean
+.SUFFIXES:
--- a/man/cwebp.1
+++ b/man/cwebp.1
@@ -1,5 +1,5 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH CWEBP 1 "March  28, 2011"
+.TH CWEBP 1 "September 19, 2011"
 .SH NAME
 cwebp \- compress an image file to a WebP file
 .SH SYNOPSIS
@@ -11,7 +11,7 @@ This manual page documents the
 .B cwebp
 command.
 .PP
-\fBcwebp\fP compresses image using the WebP format.
+\fBcwebp\fP compresses an image using the WebP format.
 Input format can be either PNG, JPEG, or raw Y'CbCr samples.
 When using PNG, the transparency information (alpha channel) is currently
 discarded.
@@ -32,8 +32,8 @@ A summary of all the possible options.
 Print the version number (as major.minor.revision) and exit.
 .TP
 .B \-q float
-Specify the compression factor between 0 and 100. Small factor
-produce smaller file with lower quality. Best quality is achieved
+Specify the compression factor between 0 and 100. A small factor
+produces a smaller file with lower quality. Best quality is achieved
 using a value of 100. The default is 75.
 .TP
 .B \-f int
@@ -47,7 +47,7 @@ appear. Typical values are usually in the range of 20 to 50.
 Specify a set of pre-defined parameters to suit a particular type of
 source material. Possible values are:  \fBdefault\fP, \fBphoto\fP,
 \fBpicture\fP, \fBdrawing\fP, \fBicon\fP, \fBtext\fP. Since
-\fB\-preset\fP overwrites the other parameter's values (except the
+\fB\-preset\fP overwrites the other parameters' values (except the
 \fB\-q\fP one), this option should preferably appear first in the
 order of the arguments.
 .TP
@@ -86,6 +86,25 @@ used thanks to the \fB\-f\fP option). Strong filtering is off by default.
 Change the number of partitions to use during the segmentation of the
 sns algorithm. Segments should be in range 1 to 4. Default value is 4.
 .TP
+.B \-partition_limit int
+Degrade quality by limiting the number of bits used by some macroblocks.
+Range is 0 (no degradation, the default) to 100 (full degradation).
+Useful values are usually around 30-70 for moderately large images.
+In the VP8 format, the so-called control partition has a limit of 512k and
+is used to store the following information: whether the macroblock is skipped,
+which segment it belongs to, whether it is coded as intra 4x4 or intra 16x16
+mode, and finally the prediction modes to use for each of the sub-blocks.
+For a very large image, 512k only leaves room for a few bits per 16x16 macroblock.
+The absolute minimum is 4 bits per macroblock. Skip, segment, and mode
+information can use up almost all these 4 bits (although the case is unlikely),
+which is problematic for very large images. The partition_limit factor controls
+how frequently the most bit-costly mode (intra 4x4) will be used. This is
+useful in case the 512k limit is reached and the following message is displayed:
+\fIError code: 6 (PARTITION0_OVERFLOW: Partition #0 is too big to fit 512k)\fP.
+If using \fB\-partition_limit\fP is not enough to meet the 512k constraint, one
+should use fewer segments in order to save more header bits per macroblock.
+See the \fB\-segments\fP option.
+.TP
 .B \-size int
 Specify a target size (in bytes) to try and reach for the compressed output.
 Compressor will make several pass of partial encoding in order to get as
@@ -102,8 +121,9 @@ options \fB\-size\fP or \fB\-psnr\fP. Maximum value is 10.
 .TP
 .B \-crop x_position y_position width height
 Crop the source to a rectangle with top-left corner at coordinates
-(x_position, y_position) and size width x height. This cropping area must
-be fully contained within the source rectangle.
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
+.TP
 .B \-s width height
 Specify that the input file actually consists of raw Y'CbCr samples following
 the ITU-R BT.601 recommendation, in 4:2:0 linear format.
@@ -117,6 +137,9 @@ range from 1 to 6. This is only meant to help debugging.
 Specify a pre-processing filter. This option is a placeholder
 and has currently no effect.
 .TP
+.B \-noasm
+Disable all assembly optimizations.
+.TP
 .B \-v
 Print extra information (encoding time in particular).
 .TP
--- a/man/dwebp.1
+++ b/man/dwebp.1
@@ -1,7 +1,7 @@
 .\"                                      Hey, EMACS: -*- nroff -*-
-.TH DWEBP 1 "March  28, 2011"
+.TH DWEBP 1 "September 19, 2011"
 .SH NAME
-dwebp \- compress a WebP file to an image file
+dwebp \- decompress a WebP file to an image file
 .SH SYNOPSIS
 .B dwebp
 .RI [ options ] " input_file.webp
@@ -11,7 +11,7 @@ This manual page documents the
 .B dwebp
 command.
 .PP
-\fBdwebp\fP decompresses WebP files into PNG or PPM images.
+\fBdwebp\fP decompresses WebP files into PNG, PPM or PGM images.
 .SH OPTIONS
 The basic options are:
 .TP
@@ -32,8 +32,37 @@ Change the output format to PGM. The output consist of luma/chroma
 samples instead of RGB, using the ICM4 layout. This option is mainly
 for verification and debugging purpose.
 .TP
+.B \-nofancy
+Don't use the fancy upscaler for YUV420. This may lead to jagged
+edges (especially the red ones), but should be faster.
+.TP
+.B \-nofilter
+Don't use the in-loop filtering process even if it is required by
+the bitstream. This may produce visible blocks on the non-compliant output,
+but will make the decoding faster.
+.TP
+.B \-mt
+Use multi-threading for decoding, if possible.
+.TP
+.B \-crop x_position y_position width height
+Crop the decoded picture to a rectangle with top-left corner at coordinates
+(\fBx_position\fP, \fBy_position\fP) and size \fBwidth\fP x \fBheight\fP.
+This cropping area must be fully contained within the source rectangle.
+The top-left corner will be snapped to even coordinates if needed.
+This option is meant to reduce the memory needed for cropping large images.
+Note: the cropping is applied \fIbefore\fP any scaling.
+.TP
+.B \-scale width height
+Rescale the decoded picture to dimension \fBwidth\fP x \fBheight\fP. This option is
+mostly intended to reduce the memory needed to decode large images,
+when only a small version is needed (thumbnail, preview, etc.).
+Note: scaling is applied \fIafter\fP cropping.
+.TP
 .B \-v
 Print extra information (decoding time in particular).
+.TP
+.B \-noasm
+Disable all assembly optimizations.

 .SH Examples:
 dwebp picture.webp -o output.png
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,12 +1,14 @@
-SUBDIRS = dec enc
+SUBDIRS = dec enc dsp utils

 AM_CPPFLAGS = -I$(top_srcdir)/src
 lib_LTLIBRARIES = libwebp.la

 libwebp_la_SOURCES =
 libwebp_la_LIBADD = dec/libwebpdecode.la \
-                    enc/libwebpencode.la
-libwebp_la_LDFLAGS = -version-info 0:0:0
+                    enc/libwebpencode.la \
+                    utils/libwebputils.la \
+                    dsp/libwebpdsp.la
+libwebp_la_LDFLAGS = -version-info 2:0:0
 libwebpinclude_HEADERS = webp/types.h webp/decode.h webp/decode_vp8.h \
                         webp/encode.h
 libwebpincludedir = $(includedir)/webp
--- a/src/dec/Makefile.am
+++ b/src/dec/Makefile.am
@@ -1,14 +1,13 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src

-libwebpdecode_la_SOURCES = bits.h vp8i.h yuv.h bits.c dsp.c frame.c \
-                          quant.c tree.c vp8.c webp.c yuv.c idec.c
-libwebpdecode_la_LDFLAGS = -version-info 0:0:0
+libwebpdecode_la_SOURCES = vp8i.h webpi.h \
+                           frame.c quant.c tree.c vp8.c webp.c \
+                           idec.c alpha.c layer.c io.c buffer.c
+libwebpdecode_la_LDFLAGS = -version-info 2:0:0
+libwebpdecode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
 libwebpdecodeinclude_HEADERS = ../webp/decode.h ../webp/decode_vp8.h ../webp/types.h
 libwebpdecodeincludedir = $(includedir)/webp

-noinst_HEADERS = bits.h vp8i.h webpi.h yuv.h
+noinst_HEADERS = vp8i.h webpi.h

 noinst_LTLIBRARIES = libwebpdecode.la
-# uncomment the following line (and comment the above) if you want
-# to install libwebpdecode library.
-#lib_LTLIBRARIES = libwebpdecode.la
--- a/src/dec/alpha.c
+++ b/src/dec/alpha.c
@@ -0,0 +1,69 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane decompression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+
+#include "zlib.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows) {
+  uint8_t* output = dec->alpha_plane_;
+  const int stride = dec->pic_hdr_.width_;
+  if (row < 0 || row + num_rows > dec->pic_hdr_.height_) {
+    return NULL;    // sanity check
+  }
+  if (row == 0) {
+    // TODO(skal): for now, we just decompress everything during the first call.
+    // Later, we'll decode progressively, but we need to store the
+    // z_stream state.
+    const uint8_t* data = dec->alpha_data_;
+    size_t data_size = dec->alpha_data_size_;
+    const size_t output_size = stride * dec->pic_hdr_.height_;
+    int ret = Z_OK;
+    z_stream strm;
+
+    memset(&strm, 0, sizeof(strm));
+    if (inflateInit(&strm) != Z_OK) {
+      return 0;
+    }
+    strm.avail_in = data_size;
+    strm.next_in = (unsigned char*)data;
+    do {
+      strm.avail_out = output_size;
+      strm.next_out = output;
+      ret = inflate(&strm, Z_NO_FLUSH);
+      if (ret == Z_NEED_DICT || ret == Z_DATA_ERROR || ret == Z_MEM_ERROR) {
+        break;
+      }
+    } while (strm.avail_out == 0);
+
+    inflateEnd(&strm);
+    if (ret != Z_STREAM_END) {
+      return NULL;    // error
+    }
+  }
+  return output + row * stride;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif    // WEBP_EXPERIMENTAL_FEATURES
--- a/src/dec/buffer.c
+++ b/src/dec/buffer.c
@@ -0,0 +1,198 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Everything about WebPDecBuffer
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "vp8i.h"
+#include "webpi.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer
+
+// Number of bytes per pixel for the different color-spaces.
+static const int kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 1, 1 };
+
+// Verifies that the strides and sizes recorded in 'buffer' can hold a picture
+// of buffer->width x buffer->height; products are formed in 64 bits so that
+// large caller-supplied strides cannot wrap around and pass the test.
+static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
+  int ok = 1;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+  const int64_t width = buffer->width;
+  const int64_t height = buffer->height;
+  if (mode >= MODE_YUV) {   // YUV checks
+    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int64_t uv_height = (height + 1) / 2;
+    ok &= (buf->y_stride * height <= buf->y_size);
+    ok &= (buf->u_stride * uv_height <= buf->u_size);
+    ok &= (buf->v_stride * uv_height <= buf->v_size);
+    ok &= (buf->a_stride * height <= buf->a_size);
+    ok &= (buf->y_stride >= width);
+    ok &= (buf->u_stride >= (width + 1) / 2);
+    ok &= (buf->v_stride >= (width + 1) / 2);
+    if (buf->a) {
+      ok &= (buf->a_stride >= width);
+    }
+  } else {    // RGB checks
+    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    ok &= (buf->stride * height <= buf->size);
+    ok &= (buf->stride >= width * kModeBpp[mode]);
+  }
+  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
+}
+
+// Validates 'buffer' (dimensions, colorspace) and, unless the memory is
+// external or already allocated, mallocs one block holding all planes and
+// fills in plane pointers, strides and sizes. Returns CheckDecBuffer()'s verdict.
+static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
+  const int w = buffer->width;
+  const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+
+  if (w <= 0 || h <= 0 || (int)mode < 0 || mode >= MODE_LAST) {
+    return VP8_STATUS_INVALID_PARAM;   // 'mode' must index kModeBpp[]
+  }
+  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
+    uint8_t* output;
+    int uv_stride = 0, a_stride = 0;
+    // 64-bit on purpose: an 'int' uv_size would truncate before the check.
+    uint64_t uv_size = 0, a_size = 0, total_size;
+    // We need memory and it hasn't been allocated yet.
+    // => initialize output buffer, now that dimensions are known.
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;
+
+    if (mode >= MODE_YUV) {
+      uv_stride = (w + 1) / 2;
+      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
+      if (mode == MODE_YUVA) {
+        a_stride = w;
+        a_size = (uint64_t)a_stride * h;
+      }
+    }
+    total_size = size + 2 * uv_size + a_size;
+
+    // Security/sanity checks
+    if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
+    buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size);
+    if (output == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+
+    if (mode >= MODE_YUV) {   // YUVA initialization
+      WebPYUVABuffer* const buf = &buffer->u.YUVA;
+      buf->y = output;
+      buf->y_stride = stride;
+      buf->y_size = size;
+      buf->u = output + size;
+      buf->u_stride = uv_stride;
+      buf->u_size = uv_size;
+      buf->v = output + size + uv_size;
+      buf->v_stride = uv_stride;
+      buf->v_size = uv_size;
+      if (mode == MODE_YUVA) {
+        buf->a = output + size + 2 * uv_size;
+      }
+      buf->a_size = a_size;
+      buf->a_stride = a_stride;
+    } else {  // RGBA initialization
+      WebPRGBABuffer* const buf = &buffer->u.RGBA;
+      buf->rgba = output;
+      buf->stride = stride;
+      buf->size = size;
+    }
+  }
+  return CheckDecBuffer(buffer);
+}
+
+// Applies 'options' (cropping, then scaling) to the w x h picture size and
+// allocates/verifies 'out'. Crop bounds are tested without overflowing 'int'.
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const out) {
+  if (out == NULL || w <= 0 || h <= 0) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (options != NULL) {    // First, apply options if there is any.
+    if (options->use_cropping) {
+      const int cw = options->crop_width;
+      const int ch = options->crop_height;
+      const int x = options->crop_left & ~1;
+      const int y = options->crop_top & ~1;
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || cw > w - x || ch > h - y) {
+        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
+      }
+      w = cw;
+      h = ch;
+    }
+    if (options->use_scaling) {
+      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+        return VP8_STATUS_INVALID_PARAM;
+      }
+      w = options->scaled_width;
+      h = options->scaled_height;
+    }
+  }
+  out->width = w;
+  out->height = h;
+  return AllocateBuffer(out);   // Then, allocate buffer for real
+}
+
+//------------------------------------------------------------------------------
+// constructors / destructors
+
+// Zeroes '*buffer'; returns 0 on ABI version mismatch or NULL 'buffer'.
+int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) {
+  if (version != WEBP_DECODER_ABI_VERSION || buffer == NULL) return 0;
+  memset(buffer, 0, sizeof(*buffer));
+  return 1;
+}
+
+// Frees the memory owned by 'buffer'; external memory is never freed here.
+void WebPFreeDecBuffer(WebPDecBuffer* const buffer) {
+  if (buffer == NULL) return;
+  if (!buffer->is_external_memory) free(buffer->private_memory);
+  buffer->private_memory = NULL;
+}
+
+
+// Shallow copy of 'src' into 'dst'; 'dst' never takes ownership of the memory.
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst) {
+  if (src == NULL || dst == NULL) return;
+  *dst = *src;
+  if (dst->private_memory != NULL) {
+    dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
+    dst->private_memory = NULL;
+  }
+}
+
+// Copies 'src' into 'dst' and hands the private memory over to 'dst': 'src'
+// is left marked as external, without pointer (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
+  if (src == NULL || dst == NULL) return;
+  *dst = *src;
+  if (src->private_memory != NULL) {
+    src->is_external_memory = 1;   // src relinquishes ownership
+    src->private_memory = NULL;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/frame.c
+++ b/src/dec/frame.c
@@ -10,7 +10,7 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <stdlib.h>
-#include "vp8i.h"
+#include "./vp8i.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -18,25 +18,84 @@ extern "C" {

 #define ALIGN_MASK (32 - 1)

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
+//
+// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
+// immediately, and needs to wait for first few rows of the next macroblock to
+// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
+// on strength).
+// With two threads, the vertical positions of the rows being decoded are:
+// Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
+// Deblock:         [ 0..11][12..27][28..43][44..59][...
+// If we use two threads and two caches of 16 pixels, the sequence would be:
+// Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
+// Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
+// The problem occurs for rows [12..15!!], which both the decoding and
+// deblocking threads are writing simultaneously.
+// With 3 cache lines, one gets a safe write pattern:
+// Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
+// Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
+// Note that multi-threaded output _without_ deblocking can make use of two
+// cache lines of 16 pixels only, since there's no lagging behind. The decoding
+// and output process have non-concurrent writing:
+// Decode:  [ 0..15][16..31][ 0..15][16..31][...
+// io->put:         [ 0..15][16..31][ 0..15][...
+
+#define MT_CACHE_LINES 3
+#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
+
+// Resets the worker when threading is enabled, and sets dec->num_caches_.
+static int InitThreadContext(VP8Decoder* const dec) {
+  dec->cache_id_ = 0;
+  if (dec->use_threads_) {
+    WebPWorker* const worker = &dec->worker_;
+    if (!WebPWorkerReset(worker)) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "thread initialization failed.");
+    }
+    worker->data1 = dec;   // presumably the hook's arguments: (dec, io)
+    worker->data2 = (void*)&dec->thread_ctx_.io_;
+    worker->hook = (WebPWorkerHook)VP8FinishRow;
+    dec->num_caches_ =     // two lines are enough when nothing is filtered
+      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
+  } else {
+    dec->num_caches_ = ST_CACHE_LINES;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
 // Memory setup

-// how many extra luma lines are needed for caching, given a filtering level
-static const uint8_t kFilterExtraRows[3] = { 0, 4, 8 };
+// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
+// for caching, given a filtering level.
+// Simple filter:  up to 2 luma samples are read and 1 is written.
+// Complex filter: up to 4 luma samples are read and 3 are written. Same for
+//                 U/V, so it's 8 samples total (because of the 2x upsampling).
+static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };

-int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+static int AllocateMemory(VP8Decoder* const dec) {
+  const int num_caches = dec->num_caches_;
  const int mb_w = dec->mb_w_;
  const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
  const int top_size = (16 + 8 + 8) * mb_w;
-  const int info_size = (mb_w + 1) * sizeof(VP8MB);
+  const int mb_info_size = (mb_w + 1) * sizeof(VP8MB);
+  const int f_info_size =
+    (dec->filter_type_ > 0) ?
+        mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+      : 0;
  const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
  const int coeffs_size = 384 * sizeof(*dec->coeffs_);
-  const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const int cache_height = (16 * num_caches
+                         + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const int cache_size = top_size * cache_height;
+  const int alpha_size =
+    dec->alpha_data_ ? (dec->pic_hdr_.width_ * dec->pic_hdr_.height_) : 0;
  const int needed = intra_pred_mode_size
-                   + top_size + info_size
+                   + top_size + mb_info_size + f_info_size
                   + yuv_size + coeffs_size
-                   + cache_size + ALIGN_MASK;
+                   + cache_size + alpha_size + ALIGN_MASK;
  uint8_t* mem;

  if (needed > dec->mem_size_) {
@@ -62,7 +121,18 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
  mem += 8 * mb_w;

  dec->mb_info_ = ((VP8MB*)mem) + 1;
-  mem += info_size;
+  mem += mb_info_size;
+
+  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
+  mem += f_info_size;
+  dec->thread_ctx_.id_ = 0;
+  dec->thread_ctx_.f_info_ = dec->f_info_;
+  if (dec->use_threads_) {
+    // secondary cache line. The deblocking process needs to make use of the
+    // filtering strength from previous macroblock row, while the new ones
+    // are being decoded in parallel. We'll just swap the pointers.
+    dec->thread_ctx_.f_info_ += mb_w;
+  }

  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
  assert((yuv_size & ALIGN_MASK) == 0);
@@ -79,36 +149,48 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
    const int extra_y = extra_rows * dec->cache_y_stride_;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
-    dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv;
-    dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_u_ = dec->cache_y_
+                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_
+                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_id_ = 0;
  }
  mem += cache_size;

+  // alpha plane
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  mem += alpha_size;
+
  // note: left-info is initialized once for all.
-  memset(dec->mb_info_ - 1, 0, (mb_w + 1) * sizeof(*dec->mb_info_));
+  memset(dec->mb_info_ - 1, 0, mb_info_size);

  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);

+  return 1;
+}
+
+static void InitIo(VP8Decoder* const dec, VP8Io* io) {
  // prepare 'io'
-  io->width = dec->pic_hdr_.width_;
-  io->height = dec->pic_hdr_.height_;
  io->mb_y = 0;
  io->y = dec->cache_y_;
  io->u = dec->cache_u_;
  io->v = dec->cache_v_;
  io->y_stride = dec->cache_y_stride_;
  io->uv_stride = dec->cache_uv_stride_;
-  io->fancy_upscaling = 0;    // default
-
-  // Init critical function pointers and look-up tables.
-  VP8DspInitTables();
-  VP8DspInit();
+  io->fancy_upsampling = 0;    // default
+  io->a = NULL;
+}

+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
+  if (!AllocateMemory(dec)) return 0;
+  InitIo(dec, io);
+  VP8DspInit();  // Init critical function pointers and look-up tables.
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Filtering

 static inline int hev_thresh_from_level(int level, int keyframe) {
@@ -119,12 +201,13 @@ static inline int hev_thresh_from_level(int level, int keyframe) {
  }
 }

-static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
-  VP8MB* const mb = dec->mb_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + mb_x * 16;
+static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int y_bps = dec->cache_y_stride_;
-  const int level = mb->f_level_;
-  const int ilevel = mb->f_ilevel_;
+  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  const int level = f_info->f_level_;
+  const int ilevel = f_info->f_ilevel_;
  const int limit = 2 * level + ilevel;
  if (level == 0) {
    return;
@@ -133,26 +216,26 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
      VP8SimpleHFilter16i(y_dst, y_bps, limit);
    }
    if (mb_y > 0) {
      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
      VP8SimpleVFilter16i(y_dst, y_bps, limit);
    }
  } else {    // complex
-    uint8_t* const u_dst = dec->cache_u_ + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + mb_x * 8;
    const int uv_bps = dec->cache_uv_stride_;
+    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
    const int hev_thresh =
        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
@@ -160,16 +243,29 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
-    if (mb->f_inner_) {
+    if (f_info->f_inner_) {
      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
  }
 }

+// Filters macroblocks [tl_mb_x_, br_mb_x_) of the row thread_ctx_.mb_y_.
+static void FilterRow(const VP8Decoder* const dec) {
+  int mb_x;
+  const int mb_y = dec->thread_ctx_.mb_y_;
+  assert(dec->thread_ctx_.filter_row_);   // the caller tests the flag first
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    DoFilter(dec, mb_x, mb_y);
+  }
+}
+
+//------------------------------------------------------------------------------
+
 void VP8StoreBlock(VP8Decoder* const dec) {
  if (dec->filter_type_ > 0) {
-    VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+    VP8FInfo* const info = dec->f_info_ + dec->mb_x_;
+    const int skip = dec->mb_info_[dec->mb_x_].skip_;
    int level = dec->filter_levels_[dec->segment_];
    if (dec->filter_hdr_.use_lf_delta_) {
      // TODO(skal): only CURRENT is handled for now.
@@ -193,14 +289,16 @@ void VP8StoreBlock(VP8Decoder* const dec) {
    }

    info->f_ilevel_ = (level < 1) ? 1 : level;
-    info->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+    info->f_inner_ = (!skip || dec->is_i4x4_);
  }
  {
    // Transfer samples to row cache
    int y;
-    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16;
-    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8;
-    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8;
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
    for (y = 0; y < 16; ++y) {
      memcpy(ydst + y * dec->cache_y_stride_,
             dec->yuv_b_ + Y_OFF + y * BPS, 16);
@@ -214,56 +312,205 @@ void VP8StoreBlock(VP8Decoder* const dec) {
  }
 }

+//------------------------------------------------------------------------------
+// This function is called after a row of macroblocks is finished decoding.
+// It also takes into account the following restrictions:
+//  * In case of in-loop filtering, we must hold off sending some of the bottom
+//    pixels as they are yet unfiltered. They will be when the next macroblock
+//    row is decoded. Meanwhile, we must preserve them by rotating them in the
+//    cache area. This doesn't hold for the very bottom row of the uncropped
+//    picture of course.
+//  * we must clip the remaining pixels against the cropping area. The VP8Io
+//    pointers and mb_y / mb_w / mb_h are adjusted for that before put().
+
+#define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
+
+// Finalize and transmit a complete row. Return false in case of user-abort.
 int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) {
+  int ok = 1;
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int first_row = (dec->mb_y_ == 0);
-  const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1);
-  uint8_t* const ydst = dec->cache_y_ - ysize;
-  uint8_t* const udst = dec->cache_u_ - uvsize;
-  uint8_t* const vdst = dec->cache_v_ - uvsize;
-  if (dec->filter_type_ > 0) {
-    int mb_x;
-    for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
-      DoFilter(dec, mb_x, dec->mb_y_);
-    }
+  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
+  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
+  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
+  const int first_row = (ctx->mb_y_ == 0);
+  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+
+  if (ctx->filter_row_) {
+    FilterRow(dec);
  }
+
  if (io->put) {
-    int y_start = dec->mb_y_ * 16;
-    int y_end = y_start + 16;
    if (!first_row) {
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
      io->v = vdst;
    } else {
-      io->y = dec->cache_y_;
-      io->u = dec->cache_u_;
-      io->v = dec->cache_v_;
+      io->y = dec->cache_y_ + y_offset;
+      io->u = dec->cache_u_ + uv_offset;
+      io->v = dec->cache_v_ + uv_offset;
    }
+
    if (!last_row) {
      y_end -= extra_y_rows;
    }
-    if (y_end > io->height) {
-      y_end = io->height;
+    if (y_end > io->crop_bottom) {
+      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
    }
-    io->mb_y = y_start;
-    io->mb_h = y_end - y_start;
-    if (!io->put(io)) {
-      return 0;
+    io->a = NULL;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (dec->alpha_data_) {
+      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
+      if (io->a == NULL) {
+        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                           "Could not decode alpha data.");
+      }
+    }
+#endif
+    if (y_start < io->crop_top) {
+      const int delta_y = io->crop_top - y_start;
+      y_start = io->crop_top;
+      assert(!(delta_y & 1));
+      io->y += dec->cache_y_stride_ * delta_y;
+      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
+      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
+      if (io->a) {
+        io->a += io->width * delta_y;
+      }
+    }
+    if (y_start < y_end) {
+      io->y += io->crop_left;
+      io->u += io->crop_left >> 1;
+      io->v += io->crop_left >> 1;
+      if (io->a) {
+        io->a += io->crop_left;
+      }
+      io->mb_y = y_start - io->crop_top;
+      io->mb_w = io->crop_right - io->crop_left;
+      io->mb_h = y_end - y_start;
+      ok = io->put(io);
    }
  }
-    // rotate top samples
-  if (!last_row) {
-    memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize);
-    memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize);
-    memcpy(vdst, vdst + 8 * dec->cache_uv_stride_, uvsize);
+  // rotate top samples if needed
+  if (ctx->id_ + 1 == dec->num_caches_) {
+    if (!last_row) {
+      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
+      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
+      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
+    }
  }
-  return 1;
+
+  return ok;
 }

-//-----------------------------------------------------------------------------
+#undef MACROBLOCK_VPOS
+
+//------------------------------------------------------------------------------
+// Finishes row dec->mb_y_: calls VP8FinishRow() directly or via the worker.
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  if (!dec->use_threads_) {
+    // ctx->id_ and ctx->f_info_ are already set
+    ctx->mb_y_ = dec->mb_y_;
+    ctx->filter_row_ = dec->filter_row_;
+    ok = VP8FinishRow(dec, io);
+  } else {
+    WebPWorker* const worker = &dec->worker_;
+    // Finish previous job *before* updating context
+    ok &= WebPWorkerSync(worker);
+    assert(worker->status_ == OK);
+    if (ok) {   // spawn a new deblocking/output job
+      ctx->io_ = *io;              // the job works on its own copy of 'io'
+      ctx->id_ = dec->cache_id_;   // cache line the row was stored into
+      ctx->mb_y_ = dec->mb_y_;
+      ctx->filter_row_ = dec->filter_row_;
+      if (ctx->filter_row_) {    // just swap filter info
+        VP8FInfo* const tmp = ctx->f_info_;
+        ctx->f_info_ = dec->f_info_;
+        dec->f_info_ = tmp;
+      }
+      WebPWorkerLaunch(worker);
+      if (++dec->cache_id_ == dec->num_caches_) {   // cycle through the lines
+        dec->cache_id_ = 0;
+      }
+    }
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Calls the user's setup(), then finishes setting up the decoding parameters.
+
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
+  // Call setup() first. This may trigger additional decoding features on 'io'.
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup && !io->setup(io)) {
+    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
+    return dec->status_;
+  }
+
+  // Disable filtering per user request
+  if (io->bypass_filtering) {
+    dec->filter_type_ = 0;
+  }
+  // TODO(skal): filter type / strength / sharpness forcing
+
+  // Define the area where we can skip in-loop filtering, in case of cropping.
+  //
+  // 'Simple' filter reads two luma samples outside of the macroblock
+  // and filters one. It doesn't filter the chroma samples. Hence, we can
+  // avoid doing the in-loop filtering before crop_top/crop_left position.
+  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
+  // Means: there's a dependency chain that goes all the way up to the
+  // top-left corner of the picture (MB #0). We must filter all the previous
+  // macroblocks.
+  // TODO(skal): add an 'approximate_decoding' option, that won't produce
+  // a 1:1 bit-exactness for complex filtering?
+  {
+    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
+    if (dec->filter_type_ == 2) {
+      // For complex filter, we need to preserve the dependency chain.
+      dec->tl_mb_x_ = 0;
+      dec->tl_mb_y_ = 0;
+    } else {
+      // For simple filter, we can filter only the cropped region.
+      dec->tl_mb_y_ = io->crop_top >> 4;
+      dec->tl_mb_x_ = io->crop_left >> 4;
+    }
+    // We need some 'extra' pixels on the right/bottom.
+    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
+    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
+    if (dec->br_mb_x_ > dec->mb_w_) {   // clamp to the picture, in macroblocks
+      dec->br_mb_x_ = dec->mb_w_;
+    }
+    if (dec->br_mb_y_ > dec->mb_h_) {
+      dec->br_mb_y_ = dec->mb_h_;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  if (dec->use_threads_) {
+    ok = WebPWorkerSync(&dec->worker_);   // finish the pending row job
+  }
+  // teardown() runs regardless of 'ok'; only the sync result is returned.
+  if (io->teardown) {
+    io->teardown(io);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
 // Main reconstruction function.

 static const int kScan[16] = {
@@ -358,7 +605,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
        uint8_t* const dst = y_dst + kScan[n];
        VP8PredLuma4[dec->imodes_[n]](dst);
        if (dec->non_zero_ac_ & (1 << n)) {
-          VP8Transform(coeffs + n * 16, dst);
+          VP8Transform(coeffs + n * 16, dst, 0);
        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
          VP8TransformDC(coeffs + n * 16, dst);
        }
@@ -370,7 +617,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
        for (n = 0; n < 16; n++) {
          uint8_t* const dst = y_dst + kScan[n];
          if (dec->non_zero_ac_ & (1 << n)) {
-            VP8Transform(coeffs + n * 16, dst);
+            VP8Transform(coeffs + n * 16, dst, 0);
          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
            VP8TransformDC(coeffs + n * 16, dst);
          }
@@ -410,7 +657,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/dec/idec.c
+++ b/src/dec/idec.c
@@ -15,15 +15,11 @@

 #include "webpi.h"
 #include "vp8i.h"
-#include "yuv.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#define RIFF_HEADER_SIZE 20
-#define VP8_HEADER_SIZE 10
-#define WEBP_HEADER_SIZE (RIFF_HEADER_SIZE + VP8_HEADER_SIZE)
 #define CHUNK_SIZE 4096
 #define MAX_MB_SIZE 4096

@@ -32,14 +28,20 @@ extern "C" {

 // Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
 // If there is any error the decoder goes into state ERROR.
-typedef enum { STATE_HEADER = 0, STATE_PARTS0 = 1,
-               STATE_DATA = 2, STATE_DONE = 3,
-               STATE_ERROR = 4
+typedef enum {
+  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
+  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_VP8_PARTS0,
+  STATE_VP8_DATA,
+  STATE_DONE,
+  STATE_ERROR
 } DecState;

 // Operating state for the MemBuffer
-typedef enum { MEM_MODE_NONE = 0,
-               MEM_MODE_APPEND, MEM_MODE_MAP
+typedef enum {
+  MEM_MODE_NONE = 0,
+  MEM_MODE_APPEND,
+  MEM_MODE_MAP
 } MemBufferMode;

 // storage for partition #0 and partial data (in a rolling fashion)
@@ -56,12 +58,13 @@ typedef struct {

 struct WebPIDecoder {
  DecState state_;         // current decoding state
-  int w_, h_;              // width and height
  WebPDecParams params_;   // Params to store output info
  VP8Decoder* dec_;
  VP8Io io_;

-  MemBuffer mem_;          // memory buffer
+  MemBuffer mem_;          // input memory buffer.
+  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
+  uint32_t vp8_size_;      // VP8 size extracted from VP8 Header.
 };

 // MB context to restore in case VP8DecodeMB() fails
@@ -229,43 +232,63 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,

 //------------------------------------------------------------------------------

-static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) {
+static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
+  if (idec->state_ == STATE_VP8_DATA) {
+    VP8Io* const io = &idec->io_;
+    if (io->teardown) {
+      io->teardown(io);
+    }
+  }
  idec->state_ = STATE_ERROR;
  return error;
 }

-// Header
-static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) {
-  int width, height;
-  uint32_t curr_size, riff_header_size, bits;
-  WebPDecParams* params = &idec->params_;
-  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+static void ChangeState(WebPIDecoder* const idec, DecState new_state,
+                        uint32_t consumed_bytes) {
+  idec->state_ = new_state;
+  idec->mem_.start_ += consumed_bytes;   // advance past the consumed input
+  assert(idec->mem_.start_ <= idec->mem_.end_);   // never beyond the data end
+}

-  if (MemDataSize(&idec->mem_) < WEBP_HEADER_SIZE) {
+// Headers: parse what precedes the VP8 chunk, then move to the frame header.
+static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
+  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+  uint32_t curr_size = MemDataSize(&idec->mem_);
+  uint32_t vp8_size;
+  uint32_t bytes_skipped;
+  VP8StatusCode status;
+
+  status = WebPParseHeaders(&data, &curr_size, &vp8_size, &bytes_skipped);
+  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
+  } else if (status == VP8_STATUS_OK) {
+    idec->vp8_size_ = vp8_size;   // kept for DecodeVP8FrameHeader()
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, bytes_skipped);
+    return VP8_STATUS_OK;  // We have skipped all pre-VP8 chunks.
+  } else {
+    return IDecError(idec, status);   // any other status is fatal
+  }
+}
+
+static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
+  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+  const uint32_t curr_size = MemDataSize(&idec->mem_);
+  uint32_t bits;
+
+  if (curr_size < VP8_FRAME_HEADER_SIZE) {
+    // Not enough data bytes to extract VP8 Frame Header.
    return VP8_STATUS_SUSPENDED;
  }
-
-  if (!WebPInitDecParams(data, idec->mem_.end_, &width, &height, params)) {
+  if (!VP8GetInfo(data, curr_size, idec->vp8_size_, NULL, NULL, NULL)) {
    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
  }

-  // Validate and Skip over RIFF header
-  curr_size = MemDataSize(&idec->mem_);
-  if (!WebPCheckRIFFHeader(&data, &curr_size)) {
-    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
-  }
-  riff_header_size = idec->mem_.end_ - curr_size;
  bits = data[0] | (data[1] << 8) | (data[2] << 16);
+  idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;

-  idec->mem_.part0_size_ = (bits >> 5) + VP8_HEADER_SIZE;
-  idec->mem_.start_ += riff_header_size;
-  assert(idec->mem_.start_ <= idec->mem_.end_);
-
-  idec->w_ = width;
-  idec->h_ = height;
-  idec->io_.data_size -= riff_header_size;
+  idec->io_.data_size = curr_size;
  idec->io_.data = data;
-  idec->state_ = STATE_PARTS0;
+  idec->state_ = STATE_VP8_PARTS0;
  return VP8_STATUS_OK;
 }

@@ -298,14 +321,13 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
  VP8Decoder* const dec = idec->dec_;
  VP8Io* const io = &idec->io_;
  const WebPDecParams* const params = &idec->params_;
-  const WEBP_CSP_MODE mode = params->mode;
+  WebPDecBuffer* const output = params->output;

  // Wait till we have enough data for the whole partition #0
  if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) {
    return VP8_STATUS_SUSPENDED;
  }

-  io->opaque = &idec->params_;
  if (!VP8GetHeaders(dec, io)) {
    const VP8StatusCode status = dec->status_;
    if (status == VP8_STATUS_SUSPENDED ||
@@ -316,36 +338,35 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
    return IDecError(idec, status);
  }

-  if (!WebPCheckDecParams(io, params)) {
-    return IDecError(idec, VP8_STATUS_INVALID_PARAM);
+  // Allocate/Verify output buffer now
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
  }

-  if (mode != MODE_YUV) {
-    VP8YUVInit();
-  }
-
-  // allocate memory and prepare everything.
-  if (!VP8InitFrame(dec, io)) {
-    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
-  }
-  if (io->setup && !io->setup(io)) {
-    return IDecError(idec, VP8_STATUS_USER_ABORT);
-  }
-
-  // disable filtering per user request (_after_ setup() is called)
-  if (io->bypass_filtering) dec->filter_type_ = 0;
-
  if (!CopyParts0Data(idec)) {
    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
  }

-  idec->state_ = STATE_DATA;
+  // Finish setting up the decoding parameters. Will call io->setup().
+  if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  // Note: past this point, teardown() must always be called
+  // in case of error.
+  idec->state_ = STATE_VP8_DATA;
+  // Allocate memory and prepare everything.
+  if (!VP8InitFrame(dec, io)) {
+    return IDecError(idec, dec->status_);
+  }
  return VP8_STATUS_OK;
 }

 // Remaining partitions
 static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
-  VP8BitReader*  br;
+  VP8BitReader* br;
  VP8Decoder* const dec = idec->dec_;
  VP8Io* const io = &idec->io_;

@@ -355,12 +376,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
    if (dec->mb_x_ == 0) {
-      VP8MB* const left = dec->mb_info_ - 1;
-      left->nz_ = 0;
-      left->dc_nz_ = 0;
-      memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+      VP8InitScanline(dec);
    }
-
    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
      MBContext context;
      SaveContext(dec, token_br, &context);
@@ -383,14 +400,14 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
        assert(idec->mem_.start_ <= idec->mem_.end_);
      }
    }
-    if (!VP8FinishRow(dec, io)) {
+    if (!VP8ProcessRow(dec, io)) {
      return IDecError(idec, VP8_STATUS_USER_ABORT);
    }
    dec->mb_x_ = 0;
  }
-
-  if (io->teardown) {
-    io->teardown(io);
+  // Synchronize the thread and check for errors.
+  if (!VP8ExitCritical(dec, io)) {
+    return IDecError(idec, VP8_STATUS_USER_ABORT);
  }
  dec->ready_ = 0;
  idec->state_ = STATE_DONE;
@@ -403,14 +420,17 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
  VP8StatusCode status = VP8_STATUS_SUSPENDED;
  assert(idec->dec_);

-  if (idec->state_ == STATE_HEADER) {
-    status = DecodeHeader(idec);
+  if (idec->state_ == STATE_PRE_VP8) {
+    status = DecodeWebPHeaders(idec);
  }
-  if (idec->state_ == STATE_PARTS0) {
+  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+    status = DecodeVP8FrameHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8_PARTS0) {
    status = DecodePartition0(idec);
  }
-  if (idec->state_ == STATE_DATA) {
-    return DecodeRemaining(idec);
+  if (idec->state_ == STATE_VP8_DATA) {
+    status = DecodeRemaining(idec);
  }
  return status;
 }
@@ -418,9 +438,11 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Public functions

-WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) {
  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder));
-  if (!idec) return NULL;
+  if (idec == NULL) {
+    return NULL;
+  }

  idec->dec_ = VP8New();
  if (idec->dec_ == NULL) {
@@ -428,53 +450,97 @@ WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
    return NULL;
  }

-  idec->state_ = STATE_HEADER;
-  idec->params_.mode = mode;
+  idec->state_ = STATE_PRE_VP8;

  InitMemBuffer(&idec->mem_);
+  WebPInitDecBuffer(&idec->output_);
  VP8InitIo(&idec->io_);
-  WebPInitCustomIo(&idec->io_);
+
+  WebPResetDecParams(&idec->params_);
+  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
+
+#ifdef WEBP_USE_THREAD
+  idec->dec_->use_threads_ = idec->params_.options &&
+                             (idec->params_.options->use_threads > 0);
+#else
+  idec->dec_->use_threads_ = 0;
+#endif
+  idec->vp8_size_ = 0;
+
+  return idec;
+}
+
+WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size,
+                          WebPDecoderConfig* const config) {
+  WebPIDecoder* idec;
+
+  // Parse the bitstream's features, if requested:
+  if (data != NULL && data_size > 0 && config != NULL) {
+    if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) {
+      return NULL;
+    }
+  }
+  // Create an instance of the incremental decoder
+  idec = WebPINewDecoder(config ? &config->output : NULL);
+  if (!idec) {
+    return NULL;
+  }
+  // Finish initialization
+  if (config != NULL) {
+    idec->params_.options = &config->options;
+  }
  return idec;
 }

 void WebPIDelete(WebPIDecoder* const idec) {
  if (!idec) return;
  VP8Delete(idec->dec_);
-  WebPClearDecParams(&idec->params_);
  ClearMemBuffer(&idec->mem_);
+  WebPFreeDecBuffer(&idec->output_);  // the decoder's own buffer member
  free(idec);
 }

 //------------------------------------------------------------------------------
+// Wrappers around WebPINewDecoder
+
+WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) {
+  // No caller buffer: decoding goes to the decoder's own output_ buffer.
+  WebPIDecoder* const decoder = WebPINewDecoder(NULL);
+  if (decoder != NULL) decoder->output_.colorspace = mode;
+  return decoder;
+}

 WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
                          int output_buffer_size, int output_stride) {
  WebPIDecoder* idec;
-  if (mode == MODE_YUV) return NULL;
-  idec = WebPINew(mode);
-  if (idec == NULL) return NULL;
-  idec->params_.output = output_buffer;
-  idec->params_.stride = output_stride;
-  idec->params_.output_size = output_buffer_size;
-  idec->params_.external_buffer = 1;
+  if (mode >= MODE_YUV) return NULL;  // RGB-family modes only
+  idec = WebPINewDecoder(NULL);  // NULL: use the decoder's own output_
+  if (!idec) return NULL;
+  idec->output_.colorspace = mode;
+  idec->output_.is_external_memory = 1;  // pixels live in caller's memory
+  idec->output_.u.RGBA.rgba = output_buffer;
+  idec->output_.u.RGBA.stride = output_stride;
+  idec->output_.u.RGBA.size = output_buffer_size;
  return idec;
 }

 WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
                          uint8_t* u, int u_size, int u_stride,
                          uint8_t* v, int v_size, int v_stride) {
-  WebPIDecoder* idec = WebPINew(MODE_YUV);
-  if (idec == NULL) return NULL;
-  idec->params_.output = luma;
-  idec->params_.stride = luma_stride;
-  idec->params_.output_size = luma_size;
-  idec->params_.u = u;
-  idec->params_.u_stride = u_stride;
-  idec->params_.output_u_size = u_size;
-  idec->params_.v = v;
-  idec->params_.v_stride = v_stride;
-  idec->params_.output_v_size = v_size;
-  idec->params_.external_buffer = 1;
+  WebPIDecoder* const idec = WebPINewDecoder(NULL);
+  if (!idec) return NULL;  // decoder creation failed
+  idec->output_.colorspace = MODE_YUV;
+  idec->output_.is_external_memory = 1;  // planes live in caller's memory
+  idec->output_.u.YUVA.y = luma;
+  idec->output_.u.YUVA.y_stride = luma_stride;
+  idec->output_.u.YUVA.y_size = luma_size;
+  idec->output_.u.YUVA.u = u;
+  idec->output_.u.YUVA.u_stride = u_stride;
+  idec->output_.u.YUVA.u_size = u_size;
+  idec->output_.u.YUVA.v = v;
+  idec->output_.u.YUVA.v_stride = v_stride;
+  idec->output_.u.YUVA.v_size = v_size;
  return idec;
 }

@@ -538,38 +604,81 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,

 //------------------------------------------------------------------------------

-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y,
-                        int* width, int* height, int* stride) {
-  if (!idec || !idec->dec_ || idec->params_.mode != MODE_RGB ||
-      idec->state_ <= STATE_PARTS0) {
+static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
+  if (!idec || !idec->dec_ || idec->state_ <= STATE_VP8_PARTS0) {  // not ready
    return NULL;
  }
-
-  if (last_y) *last_y = idec->params_.last_y;
-  if (width) *width = idec->w_;
-  if (height) *height = idec->h_;
-  if (stride) *stride = idec->params_.stride;
-
  return idec->params_.output;
 }

-uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int *last_y,
-                        uint8_t** u, uint8_t** v, int* width, int* height,
-                        int *stride, int* uv_stride) {
-  if (!idec || !idec->dec_ || idec->params_.mode != MODE_YUV ||
-      idec->state_ <= STATE_PARTS0) {
+const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* const idec,
+                                      int* const left, int* const top,
+                                      int* const width, int* const height) {
+  // Area decoded so far: anchored at (0,0), as wide as the output buffer and
+  // 'last_y' rows tall. Reported as 0x0 when no output is available yet.
+  const WebPDecBuffer* const buffer = GetOutputBuffer(idec);
+  const int has_output = (buffer != NULL);
+  const int area_width = has_output ? buffer->width : 0;
+  const int area_height = has_output ? idec->params_.last_y : 0;
+  // TODO(skal): later include handling of rotations.
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
+  if (width != NULL) *width = area_width;
+  if (height != NULL) *height = area_height;
+  return buffer;
+}
+
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y,
+                        int* width, int* height, int* stride) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (!src) return NULL;  // no output buffer available
+  if (src->colorspace >= MODE_YUV) {  // buffer holds YUV, not RGB
    return NULL;
  }

  if (last_y) *last_y = idec->params_.last_y;
-  if (u) *u = idec->params_.u;
-  if (v) *v = idec->params_.v;
-  if (width) *width = idec->w_;
-  if (height) *height = idec->h_;
-  if (stride) *stride = idec->params_.stride;
-  if (uv_stride) *uv_stride = idec->params_.u_stride;
+  if (width) *width = src->width;  // dimensions of the output buffer
+  if (height) *height = src->height;
+  if (stride) *stride = src->u.RGBA.stride;

-  return idec->params_.output;
+  return src->u.RGBA.rgba;
+}
+
+uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
+                        uint8_t** u, uint8_t** v,
+                        int* width, int* height, int *stride, int* uv_stride) {
+  // Exposes the Y/U/V planes of the output. Every out-parameter is optional.
+  // Returns NULL if no output exists yet or if the buffer is not a YUV one.
+  const WebPDecBuffer* const buffer = GetOutputBuffer(idec);
+  const WebPYUVABuffer* planes;
+  if (buffer == NULL || buffer->colorspace < MODE_YUV) return NULL;
+  planes = &buffer->u.YUVA;
+
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = planes->u;
+  if (v != NULL) *v = planes->v;
+  if (width != NULL) *width = buffer->width;
+  if (height != NULL) *height = buffer->height;
+  if (stride != NULL) *stride = planes->y_stride;
+  if (uv_stride != NULL) *uv_stride = planes->u_stride;
+  return planes->y;
+}
+
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data) {
+  // Installs custom I/O hooks; refused (returns 0) once past STATE_PRE_VP8.
+  VP8Io* io;
+  if (idec == NULL || idec->dec_ == NULL) return 0;
+  if (idec->state_ > STATE_PRE_VP8) return 0;
+  io = &idec->io_;
+  io->put = put;
+  io->setup = setup;
+  io->teardown = teardown;
+  io->opaque = user_data;
+  return 1;
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/dec/io.c
+++ b/src/dec/io.c
@@ -0,0 +1,668 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../dec/vp8i.h"
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Main YUV<->RGB conversion functions
+
+static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
+  // Copies the freshly decoded rows of each plane into the YUVA output buffer
+  // and returns the number of luma rows emitted.
+  const WebPYUVABuffer* const out = &p->output->u.YUVA;
+  const int uv_y = io->mb_y >> 1;        // chroma is vertically subsampled
+  const int uv_width = (io->mb_w + 1) / 2;
+  const int uv_height = (io->mb_h + 1) / 2;
+  int row;
+  for (row = 0; row < io->mb_h; ++row) {
+    uint8_t* const y_dst = out->y + (io->mb_y + row) * out->y_stride;
+    memcpy(y_dst, io->y + row * io->y_stride, io->mb_w);
+  }
+  for (row = 0; row < uv_height; ++row) {
+    const int dst_row = uv_y + row, src_off = row * io->uv_stride;
+    memcpy(out->u + dst_row * out->u_stride, io->u + src_off, uv_width);
+    memcpy(out->v + dst_row * out->v_stride, io->v + src_off, uv_width);
+  }
+  return io->mb_h;
+}
+
+// Converts rows to RGB by point-sampling U/V: the two luma rows of a pair
+// share one chroma row, with no interpolation. Returns the rows emitted.
+static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const WebPRGBABuffer* const rgb = &p->output->u.RGBA;
+  const WebPSampleLinePairFunc convert = WebPSamplers[p->output->colorspace];
+  const int width = io->mb_w;
+  const int last_row = io->mb_h - 1;
+  const uint8_t* luma = io->y;
+  const uint8_t* cb = io->u;
+  const uint8_t* cr = io->v;
+  uint8_t* out = rgb->rgba + io->mb_y * rgb->stride;
+  int row = 0;
+  while (row < last_row) {   // full pairs of rows
+    convert(luma, luma + io->y_stride, cb, cr, out, out + rgb->stride, width);
+    luma += 2 * io->y_stride;
+    cb += io->uv_stride;
+    cr += io->uv_stride;
+    out += 2 * rgb->stride;
+    row += 2;
+  }
+  if (row == last_row) {   // odd row left over: feed it as both lines
+    convert(luma, luma, cb, cr, out, out, width);
+  }
+  return io->mb_h;
+}
+
+//------------------------------------------------------------------------------
+// YUV444 -> RGB conversion
+
+#if 0   // TODO(skal): disabled for now; this is for future rescaling.
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h;
+  int j;
+  for (j = 0; j < last; ++j) {
+    convert(y_src, u_src, v_src, dst, mb_w);  // one row at a time
+    y_src += io->y_stride;
+    u_src += io->uv_stride;  // chroma advances on every row here
+    v_src += io->uv_stride;
+    dst += buf->stride;
+  }
+  return io->mb_h;
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampling
+
+#ifdef FANCY_UPSAMPLING
+static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
+  int num_lines_out = io->mb_h;   // a priori guess
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const WebPUpsampleLinePairFunc upsample =
+      io->a ? WebPUpsamplersKeepAlpha[p->output->colorspace]
+            : WebPUpsamplers[p->output->colorspace];
+  const uint8_t* cur_y = io->y;
+  const uint8_t* cur_u = io->u;
+  const uint8_t* cur_v = io->v;
+  const uint8_t* top_u = p->tmp_u;  // chroma row saved by the previous call
+  const uint8_t* top_v = p->tmp_v;
+  int y = io->mb_y;
+  int y_end = io->mb_y + io->mb_h;  // one past the last row of this call
+  const int mb_w = io->mb_w;
+  const int uv_w = (mb_w + 1) / 2;
+  // Converts to RGB through the chroma upsampler; returns the rows completed.
+  if (y == 0) {
+    // First line is special cased. We mirror the u/v samples at boundary.
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+  } else {
+    // We can finish the left-over line from previous call.
+    // Warning! Don't overwrite the alpha values (if any), as they
+    // are not lagging one line behind but are already written.
+    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+    num_lines_out++;
+  }
+  // Loop over each output pairs of row.
+  for (; y + 2 < y_end; y += 2) {
+    top_u = cur_u;
+    top_v = cur_v;
+    cur_u += io->uv_stride;
+    cur_v += io->uv_stride;
+    dst += 2 * buf->stride;
+    cur_y += 2 * io->y_stride;
+    upsample(cur_y - io->y_stride, cur_y,
+             top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+  }
+  // move to last row
+  cur_y += io->y_stride;
+  if (io->crop_top + y_end < io->crop_bottom) {
+    // Save the unfinished samples for next call (as we're not done yet).
+    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
+    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
+    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
+    // The fancy upsampler leaves a row unfinished behind
+    // (except for the very last row)
+    num_lines_out--;
+  } else {
+    // Process the very last row of even-sized picture
+    if (!(y_end & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+              dst + buf->stride, NULL, mb_w);
+    }
+  }
+  return num_lines_out;
+}
+
+#endif    /* FANCY_UPSAMPLING */
+
+//------------------------------------------------------------------------------
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  // Copies the alpha rows (if any) into the 'a' plane of the YUVA output.
+  // Always returns 0. The source alpha rows are io->width bytes apart.
+  const WebPYUVABuffer* const out = &p->output->u.YUVA;
+  const uint8_t* src = io->a;
+  uint8_t* dst;
+  int rows_left;
+  if (src == NULL) return 0;
+  dst = out->a + io->mb_y * out->a_stride;
+  for (rows_left = io->mb_h; rows_left > 0; --rows_left) {
+    memcpy(dst, src, io->mb_w * sizeof(*dst));
+    src += io->width;
+    dst += out->a_stride;
+  }
+  return 0;
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  // Writes the alpha samples (if any) into byte #3 of each 4-byte pixel.
+  // NOTE(review): assumes 32-bit pixels with alpha last; MODE_ARGB and
+  // MODE_RGBA_4444 presumably need another layout -- confirm. Returns 0.
+  const WebPRGBABuffer* const out = &p->output->u.RGBA;
+  const uint8_t* src = io->a;
+  const int width = io->mb_w;
+  int x, y;
+  if (src == NULL) return 0;
+  for (y = 0; y < io->mb_h; ++y) {
+    uint8_t* const alpha_dst = out->rgba + (io->mb_y + y) * out->stride + 3;
+    for (x = 0; x < width; ++x) {
+      alpha_dst[4 * x] = src[x];
+    }
+    src += io->width;
+  }
+  return 0;
+}
+
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+// TODO(skal): start a common library for encoder and decoder, and factorize
+// this code in.
+
+#define RFIX 30   // fractional bits of the rescaler's fixed-point factors
+#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)  // rounded
+
+static void InitRescaler(WebPRescaler* const wrk,   // initializes '*wrk'
+                         int src_width, int src_height,
+                         uint8_t* dst,
+                         int dst_width, int dst_height, int dst_stride,
+                         int x_add, int x_sub, int y_add, int y_sub,
+                         int32_t* work) {  // 2 * dst_width entries are used
+  wrk->x_expand = (src_width < dst_width);  // widening horizontally?
+  wrk->src_width = src_width;
+  wrk->src_height = src_height;
+  wrk->dst_width = dst_width;
+  wrk->dst_height = dst_height;
+  wrk->dst = dst;
+  wrk->dst_stride = dst_stride;
+  // for 'x_expand', we use bilinear interpolation
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
+  wrk->y_accum = y_add;  // callers subtract y_sub per imported row
+  wrk->y_add = y_add;
+  wrk->y_sub = y_sub;
+  wrk->fx_scale = (1 << RFIX) / x_sub;  // 1/x_sub in RFIX fixed-point
+  wrk->fy_scale = (1 << RFIX) / y_sub;  // 1/y_sub in RFIX fixed-point
+  wrk->fxy_scale = wrk->x_expand ?
+      ((int64_t)dst_height << RFIX) / (x_sub * src_height) :
+      ((int64_t)dst_height << RFIX) / (x_add * src_height);
+  wrk->irow = work;              // rows accumulated so far
+  wrk->frow = work + dst_width;  // latest imported row
+}
+
+static inline void ImportRow(const uint8_t* const src,  // one source row
+                             WebPRescaler* const wrk) {
+  int x_in = 0;   // read position in 'src'
+  int x_out;      // write position in frow/irow
+  int accum = 0;
+  if (!wrk->x_expand) {   // not widening: source pixels are summed up
+    int sum = 0;
+    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+      accum += wrk->x_add;
+      for (; accum > 0; accum -= wrk->x_sub) {
+        sum += src[x_in++];
+      }
+      {        // Emit next horizontal pixel.
+        const int32_t base = src[x_in++];
+        const int32_t frac = base * (-accum);  // part carried to next pixel
+        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = MULT(frac, wrk->fx_scale);
+      }
+    }
+  } else {        // simple bilinear interpolation
+    int left = src[0], right = src[0];
+    for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+      if (accum < 0) {
+        left = right;
+        right = src[++x_in];
+        accum += wrk->x_add;
+      }
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      accum -= wrk->x_sub;
+    }
+  }
+  // Accumulate the new row's contribution
+  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+    wrk->irow[x_out] += wrk->frow[x_out];
+  }
+}
+
+static void ExportRow(WebPRescaler* const wrk) {  // emits one output row
+  int x_out;
+  const int yscale = wrk->fy_scale * (-wrk->y_accum);
+  assert(wrk->y_accum <= 0);
+  for (x_out = 0; x_out < wrk->dst_width; ++x_out) {
+    const int frac = MULT(wrk->frow[x_out], yscale);  // share of latest row
+    const int v = (int)MULT(wrk->irow[x_out] - frac, wrk->fxy_scale);
+    wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;  // 8-bit clamp
+    wrk->irow[x_out] = frac;   // new fractional start
+  }
+  wrk->y_accum += wrk->y_add;
+  wrk->dst += wrk->dst_stride;  // may be 0 (see InitRGBRescaler)
+}
+
+#undef MULT
+#undef RFIX
+
+//------------------------------------------------------------------------------
+// YUV rescaling (no final RGB conversion needed)
+
+static int Rescale(const uint8_t* src, int src_stride,
+                   int new_lines, WebPRescaler* const wrk) {
+  // Feeds 'new_lines' source rows to the rescaler, exporting every output row
+  // that becomes available. Returns the number of rows exported.
+  int exported = 0, line;
+  for (line = 0; line < new_lines; ++line, src += src_stride) {
+    ImportRow(src, wrk);
+    wrk->y_accum -= wrk->y_sub;
+    for (; wrk->y_accum <= 0; ++exported) {
+      ExportRow(wrk);
+    }
+  }
+  return exported;
+}
+
+static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
+  // Rescales Y, then U, then V; only the luma row count is reported.
+  const int chroma_rows = (io->mb_h + 1) >> 1;
+  const int luma_out = Rescale(io->y, io->y_stride, io->mb_h, &p->scaler_y);
+  Rescale(io->u, io->uv_stride, chroma_rows, &p->scaler_u);
+  Rescale(io->v, io->uv_stride, chroma_rows, &p->scaler_v);
+  return luma_out;
+}
+
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  // Rescales the alpha rows when present (io->width apart). Always returns 0.
+  const uint8_t* const alpha = io->a;
+  if (alpha != NULL) Rescale(alpha, io->width, io->mb_h, &p->scaler_a);
+  return 0;
+}
+
+static int IsAlphaMode(WEBP_CSP_MODE mode) {  // 1 if 'mode' carries alpha.
+  if (mode == MODE_YUVA || mode == MODE_RGBA_4444) return 1;
+  return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB);
+}
+
+static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  // Sets up one rescaler per output plane (Y, U, V and, for alpha modes, A),
+  // writing into the YUVA output. Returns 0 on memory error, 1 otherwise.
+  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_out_width  = (out_width + 1) >> 1;
+  const int uv_out_height = (out_height + 1) >> 1;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  // Scratch sizes (in int32_t units) use size_t so they cannot wrap as 'int'.
+  const size_t work_size = 2 * (size_t)out_width;        // luma (and alpha)
+  const size_t uv_work_size = 2 * (size_t)uv_out_width;  // each of u and v
+  const size_t tmp_size = (has_alpha ? 2 : 1) * work_size + 2 * uv_work_size;
+  int32_t* work;
+
+  // calloc(count, size) also rejects a count * size product that overflows.
+  p->memory = calloc(tmp_size, sizeof(*work));
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
+               buf->y, out_width, out_height, buf->y_stride,
+               io->mb_w, out_width, io->mb_h, out_height,
+               work);
+  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,
+               buf->u, uv_out_width, uv_out_height, buf->u_stride,
+               uv_in_width, uv_out_width,
+               uv_in_height, uv_out_height,
+               work + work_size);
+  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
+               buf->v, uv_out_width, uv_out_height, buf->v_stride,
+               uv_in_width, uv_out_width,
+               uv_in_height, uv_out_height,
+               work + work_size + uv_work_size);
+  p->emit = EmitRescaledYUV;
+  if (has_alpha) {
+    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
+                 buf->a, out_width, out_height, buf->a_stride,
+                 io->mb_w, out_width, io->mb_h, out_height,
+                 work + work_size + 2 * uv_work_size);
+    p->emit_alpha = EmitRescaledAlphaYUV;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// RGBA rescaling
+
+// Imports source rows until an output row is ready (y_accum <= 0) or the
+// 'new_lines' available rows are exhausted. Returns the rows consumed.
+static int Import(const uint8_t* src, int src_stride,
+                  int new_lines, WebPRescaler* const wrk) {
+  int consumed;
+  for (consumed = 0; consumed < new_lines; ++consumed) {
+    if (wrk->y_accum <= 0) break;   // a row is pending export: stop here
+    ImportRow(src, wrk);
+    wrk->y_accum -= wrk->y_sub;
+    src += src_stride;
+  }
+  return consumed;
+}
+
+static int ExportRGB(WebPDecParams* const p, int y_pos) {  // returns rows out
+  const WebPYUV444Converter convert =
+      WebPYUV444Converters[p->output->colorspace];
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  // For RGB rescaling, because of the YUV420, current scan position
+  // U/V can be +1/-1 line from the Y one.  Hence the double test.
+  while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) {
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
+    ExportRow(&p->scaler_y);  // each plane exports into its own tmp row
+    ExportRow(&p->scaler_u);
+    ExportRow(&p->scaler_v);
+    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
+            dst, p->scaler_y.dst_width);
+    dst += buf->stride;
+    num_lines_out++;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;  // chroma rows in this batch
+  int j = 0, uv_j = 0;  // luma / chroma rows consumed so far
+  int num_lines_out = 0;
+  while (j < mb_h) {  // until every luma row has been imported
+    const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride,
+                                  mb_h - j, &p->scaler_y);
+    const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride,
+                                  uv_mb_h - uv_j, &p->scaler_u);
+    const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride,
+                                  uv_mb_h - uv_j, &p->scaler_v);
+    (void)v_lines_in;   // remove a gcc warning
+    assert(u_lines_in == v_lines_in);
+    j += y_lines_in;
+    uv_j += u_lines_in;
+    num_lines_out += ExportRGB(p, num_lines_out);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlpha(WebPDecParams* const p, int y_pos) {  // returns rows out
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  while (p->scaler_a.y_accum <= 0) {
+    int i;  // NOTE(review): MODE_ARGB/RGBA_4444 layouts differ? confirm
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    ExportRow(&p->scaler_a);
+    for (i = 0; i < p->scaler_a.dst_width; ++i) {
+      dst[4 * i + 3] = p->scaler_a.dst[i];  // byte #3 of each 4-byte pixel
+    }
+    dst += buf->stride;
+    num_lines_out++;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  // Imports/exports alpha rows in turn until the input is consumed; returns 0.
+  int rows_in = 0, rows_out = 0;
+  while (io->a != NULL && rows_in < io->mb_h) {
+    const uint8_t* const src = io->a + rows_in * io->width;
+    rows_in += Import(src, io->width, io->mb_h - rows_in, &p->scaler_a);
+    rows_out += ExportAlpha(p, rows_out);
+  }
+  return 0;
+}
+
+static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = IsAlphaMode(p->output->colorspace);
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  int32_t* work;  // rescalers work area
+  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
+  size_t tmp_size1, tmp_size2;  // int32_t count / byte count
+  // Y, U and V (and alpha) are all rescaled to out_width x out_height.
+  tmp_size1 = 3 * work_size;
+  tmp_size2 = 3 * out_width;
+  if (has_alpha) {
+    tmp_size1 += work_size;
+    tmp_size2 += out_width;
+  }
+  p->memory =  // NOTE(review): size sum is not overflow-checked; confirm ok
+      calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  tmp = (uint8_t*)(work + tmp_size1);  // byte rows follow the work areas
+  InitRescaler(&p->scaler_y, io->mb_w, io->mb_h,
+               tmp + 0 * out_width, out_width, out_height, 0,
+               io->mb_w, out_width, io->mb_h, out_height,
+               work + 0 * work_size);
+  InitRescaler(&p->scaler_u, uv_in_width, uv_in_height,  // chroma -> full size
+               tmp + 1 * out_width, out_width, out_height, 0,
+               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+               work + 1 * work_size);
+  InitRescaler(&p->scaler_v, uv_in_width, uv_in_height,
+               tmp + 2 * out_width, out_width, out_height, 0,
+               io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+               work + 2 * work_size);
+  p->emit = EmitRescaledRGB;
+
+  if (has_alpha) {
+    InitRescaler(&p->scaler_a, io->mb_w, io->mb_h,
+                 tmp + 3 * out_width, out_width, out_height, 0,
+                 io->mb_w, out_width, io->mb_h, out_height,
+                 work + 3 * work_size);
+    p->emit_alpha = EmitRescaledAlphaRGB;
+  }
+  return 1;   // 0 is only returned on memory error
+}
+
+//------------------------------------------------------------------------------
+// Default custom functions
+
+// Sets up io's crop, scaling and filter fields. Returns 0 on bad options.
+static int InitFromOptions(const WebPDecoderOptions* const options,
+                           VP8Io* const io) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    // TODO(skal): take colorspace into account. Don't assume YUV420.
+    x = options->crop_left & ~1;
+    y = options->crop_top & ~1;
+    // 'w > W - x' is 'x + w > W' written so that the sum cannot overflow.
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || w > W - x || h > H - y) {
+      return 0;  // out of frame boundary error
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // Large downscaling ratios skip the filter too; a user bypass is kept.
+    io->bypass_filtering |= (io->scaled_width < W * 3 / 4) &&
+                            (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;
+  }
+  return 1;
+}
+
+static int CustomSetup(VP8Io* io) {  // io->setup hook: picks the emitters
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const int is_rgb = (p->output->colorspace < MODE_YUV);
+  // Returns 0 on invalid options or memory error, 1 otherwise.
+  p->memory = NULL;
+  p->emit = NULL;
+  p->emit_alpha = NULL;
+  if (!InitFromOptions(p->options, io)) {
+    return 0;  // invalid cropping/scaling options
+  }
+
+  if (io->use_scaling) {
+    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
+    if (!ok) {
+      return 0;    // memory error
+    }
+  } else {
+    if (is_rgb) {
+      p->emit = EmitSampledRGB;   // default
+#ifdef FANCY_UPSAMPLING
+      if (io->fancy_upsampling) {
+        const int uv_width = (io->mb_w + 1) >> 1;
+        p->memory = malloc(io->mb_w + 2 * uv_width);  // y, u, v rows
+        if (p->memory == NULL) {
+          return 0;   // memory error.
+        }
+        p->tmp_y = (uint8_t*)p->memory;
+        p->tmp_u = p->tmp_y + io->mb_w;
+        p->tmp_v = p->tmp_u + uv_width;
+        p->emit = EmitFancyRGB;
+        WebPInitUpsamplers();
+      }
+#endif
+    } else {
+      p->emit = EmitYUV;
+    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (IsAlphaMode(p->output->colorspace)) {
+      // We need transparency output
+      p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV;
+    }
+#endif
+  }
+
+  if (is_rgb) {
+    VP8YUVInit();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int CustomPut(const VP8Io* io) {
+  // io->put hook: emits the new rows (then their alpha, if an alpha emitter is
+  // installed) and advances last_y by the row count the main emitter reports.
+  // Returns 0 for an empty area, 1 otherwise.
+  WebPDecParams* const params = (WebPDecParams*)io->opaque;
+  const int is_empty = (io->mb_w <= 0 || io->mb_h <= 0);
+  assert(!(io->mb_y & 1));
+  if (is_empty) return 0;
+  {
+    const int rows_emitted = params->emit(io, params);
+    if (params->emit_alpha != NULL) {
+      params->emit_alpha(io, params);
+    }
+    params->last_y += rows_emitted;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void CustomTeardown(const VP8Io* io) {  // io->teardown hook
+  WebPDecParams* const params = (WebPDecParams*)io->opaque;
+  free(params->memory);    // scratch memory obtained in CustomSetup(), if any
+  params->memory = NULL;   // so that a repeated teardown is harmless
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
+  io->opaque = params;   // handed back to the hooks, which cast it again
+  io->setup = CustomSetup;
+  io->put = CustomPut;
+  io->teardown = CustomTeardown;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/layer.c
+++ b/src/dec/layer.c
@@ -0,0 +1,34 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Enhancement layer (for YUV444/422)
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Placeholder for enhancement-layer decoding: currently always returns 1.
+int VP8DecodeLayer(VP8Decoder* const dec) {
+  assert(dec);
+  assert(dec->layer_data_size_ > 0);
+  (void)dec;   // 'dec' is otherwise unused when the asserts compile out
+
+  // TODO: handle enhancement layer here.
+
+  return 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dec/quant.c
+++ b/src/dec/quant.c
@@ -58,7 +58,7 @@ static const uint16_t kAcTable[128] = {
  249, 254, 259, 264, 269, 274, 279, 284
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 9.6

 void VP8ParseQuant(VP8Decoder* const dec) {
@@ -104,7 +104,7 @@ void VP8ParseQuant(VP8Decoder* const dec) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/dec/tree.c
+++ b/src/dec/tree.c
@@ -65,7 +65,7 @@ static const int8_t kMVRef4[6] = {
 };
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Default probabilities

 // Inter
@@ -385,7 +385,7 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 13

 static const uint8_t
--- a/src/dec/vp8.c
+++ b/src/dec/vp8.c
@@ -11,18 +11,19 @@

 #include <stdlib.h>
 #include "vp8i.h"
+#include "webpi.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int WebPGetDecoderVersion(void) {
  return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Decoder

 static void SetOk(VP8Decoder* const dec) {
@@ -43,6 +44,7 @@ VP8Decoder* VP8New(void) {
  VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder));
  if (dec) {
    SetOk(dec);
+    WebPWorkerInit(&dec->worker_);
    dec->ready_ = 0;
  }
  return dec;
@@ -74,7 +76,56 @@ int VP8SetError(VP8Decoder* const dec,
  return 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Checks the VP8 frame header; extracts width/height/alpha. 0 on failure.
+int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size,
+               int* width, int* height, int* has_alpha) {
+  if (data_size < 10) {
+    return 0;         // not enough data
+  }
+  // check signature
+  if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) {
+    return 0;         // Wrong signature.
+  } else {
+    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
+    const int key_frame = !(bits & 1);   // bit 0 clear => keyframe
+    const int w = ((data[7] << 8) | data[6]) & 0x3fff;   // low 14 bits only
+    const int h = ((data[9] << 8) | data[8]) & 0x3fff;   // low 14 bits only
+
+    if (has_alpha) {   // note: written before the remaining checks below
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      if (data_size < 11) return 0;
+      *has_alpha = !!(data[10] & 0x80);    // the colorspace_ bit
+#else
+      *has_alpha = 0;
+#endif
+    }
+    if (!key_frame) {   // Not a keyframe.
+      return 0;
+    }
+
+    if (((bits >> 1) & 7) > 3) {
+      return 0;         // unknown profile
+    }
+    if (!((bits >> 4) & 1)) {
+      return 0;         // first frame is invisible!
+    }
+    if (((bits >> 5)) >= chunk_size) {  // partition_length
+      return 0;         // inconsistent size information.
+    }
+
+    if (width) {   // width/height are only written once all checks passed
+      *width = w;
+    }
+    if (height) {
+      *height = h;
+    }
+
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
 // Header parsing

 static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
@@ -194,14 +245,12 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
  return !br->eof_;
 }

-static inline uint32_t get_le32(const uint8_t* const data) {
-  return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24);
-}
-
 // Topmost call
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
-  uint8_t* buf;
+  const uint8_t* buf;
  uint32_t buf_size;
+  uint32_t vp8_chunk_size;
+  uint32_t bytes_skipped;
  VP8FrameHeader* frm_hdr;
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
@@ -216,41 +265,19 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
                       "null VP8Io passed to VP8GetHeaders()");
  }

-  buf = (uint8_t *)io->data;
+  buf = io->data;
  buf_size = io->data_size;
-  if (buf == NULL || buf_size <= 4) {
-    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                       "Not enough data to parse frame header");
+
+  // Process Pre-VP8 chunks.
+  status = WebPParseHeaders(&buf, &buf_size, &vp8_chunk_size, &bytes_skipped);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "Incorrect/incomplete header.");
  }

-  // Skip over valid RIFF headers
-  if (!memcmp(buf, "RIFF", 4)) {
-    uint32_t riff_size;
-    uint32_t chunk_size;
-    if (buf_size < 20 + 4) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 8, "WEBP", 4)) {   // wrong image file signature
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: WEBP signature not found.");
-    }
-    riff_size = get_le32(buf + 4);
-    if (riff_size < 12) {
-      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
-                         "RIFF: Truncated header.");
-    }
-    if (memcmp(buf + 12, "VP8 ", 4)) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Invalid compression format.");
-    }
-    chunk_size = get_le32(buf + 16);
-    if (chunk_size > riff_size - 12) {
-      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                         "RIFF: Inconsistent size information.");
-    }
-    buf += 20;
-    buf_size -= 20;
+  // Process the VP8 frame header.
+  if (buf_size < 4) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "Truncated header.");
  }

  // Paragraph 9.1
@@ -291,8 +318,17 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    // Setup default output area (can be later modified during io->setup())
    io->width = pic_hdr->width_;
    io->height = pic_hdr->height_;
+    io->use_scaling  = 0;
+    io->use_cropping = 0;
+    io->crop_top  = 0;
+    io->crop_left = 0;
+    io->crop_right  = io->width;
+    io->crop_bottom = io->height;
+    io->mb_w = io->width;   // sanity check
+    io->mb_h = io->height;  // ditto

    VP8ResetProba(&dec->proba_);
    ResetSegmentHeader(&dec->segment_hdr_);
@@ -305,6 +341,10 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "bad partition length");
  }
+
+  dec->alpha_data_ = NULL;
+  dec->alpha_data_size_ = 0;
+
  br = &dec->br_;
  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
  buf += frm_hdr->partition_length_;
@@ -368,12 +408,42 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {

  VP8ParseProba(br, dec);

+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  // Extensions
+  if (dec->pic_hdr_.colorspace_) {
+    const size_t kTrailerSize = 8;
+    const uint8_t kTrailerMarker = 0x01;
+    const uint8_t* ext_buf = buf - kTrailerSize;
+    size_t size;
+
+    if (frm_hdr->partition_length_ < kTrailerSize ||
+        ext_buf[kTrailerSize - 1] != kTrailerMarker) {
+ Error:
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "RIFF: Inconsistent extra information.");
+    }
+    // Alpha
+    size = (ext_buf[4] << 0) | (ext_buf[5] << 8) | (ext_buf[6] << 16);
+    if (frm_hdr->partition_length_ < size + kTrailerSize) {
+      goto Error;
+    }
+    dec->alpha_data_ = (size > 0) ? ext_buf - size : NULL;
+    dec->alpha_data_size_ = size;
+
+    // Layer
+    size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
+    dec->layer_data_size_ = size;
+    dec->layer_data_ = NULL;  // will be set later
+    dec->layer_colorspace_ = ext_buf[3];
+  }
+#endif
+
  // sanitized state
  dec->ready_ = 1;
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)

 static const uint8_t kBands[16 + 1] = {
@@ -386,7 +456,7 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t kCat6[] =
  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
-static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
 static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
@@ -422,7 +492,8 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
            if (!VP8GetBit(br, p[7])) {
              v = 5 + VP8GetBit(br, 159);
            } else {
-              v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145);
+              v = 7 + 2 * VP8GetBit(br, 165);
+              v += VP8GetBit(br, 145);
            }
          } else {
            const uint8_t* tab;
@@ -551,7 +622,7 @@ static void ParseResiduals(VP8Decoder* const dec,
 }
 #undef PACK

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main loop

 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
@@ -588,16 +659,21 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  return (!token_br->eof_);
 }

+void VP8InitScanline(VP8Decoder* const dec) {   // resets per-row state
+  VP8MB* const left = dec->mb_info_ - 1;   // entry just before mb_info_[0]
+  left->nz_ = 0;
+  left->dc_nz_ = 0;
+  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+  dec->filter_row_ =   // filter only rows within [tl_mb_y_, br_mb_y_]
+    (dec->filter_type_ > 0) &&
+    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+}
+
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
-  for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
-    VP8MB* const left = dec->mb_info_ - 1;
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
    VP8BitReader* const token_br =
        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-
-    left->nz_ = 0;
-    left->dc_nz_ = 0;
-    memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-
+    VP8InitScanline(dec);
    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
      if (!VP8DecodeMB(dec, token_br)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
@@ -608,11 +684,13 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
      // Store data and save block's filtering params
      VP8StoreBlock(dec);
    }
-    if (!VP8FinishRow(dec, io)) {
-      return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                         "Output aborted.");
+    if (!VP8ProcessRow(dec, io)) {
+      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
  }
+  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+    return 0;
+  }

  // Finish
 #ifndef ONLY_KEYFRAME_CODE
@@ -621,11 +699,20 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  }
 #endif

+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (dec->layer_data_size_ > 0) {
+    if (!VP8DecodeLayer(dec)) {
+      return 0;
+    }
+  }
+#endif
+
  return 1;
 }

 // Main entry point
 int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 0;
  if (dec == NULL) {
    return 0;
  }
@@ -641,32 +728,22 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
  }
  assert(dec->ready_);

-  // will allocate memory and prepare everything.
-  if (!VP8InitFrame(dec, io)) {
-    VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
-                       "Allocation failed");
+  // Finish setting up the decoding parameter. Will call io->setup().
+  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
+  if (ok) {   // good to go.
+    // Will allocate memory and prepare everything.
+    if (ok) ok = VP8InitFrame(dec, io);
+
+    // Main decoding loop
+    if (ok) ok = ParseFrame(dec, io);
+
+    // Exit.
+    ok &= VP8ExitCritical(dec, io);
  }

-  if (io->setup && !io->setup(io)) {
+  if (!ok) {
    VP8Clear(dec);
-    return VP8SetError(dec, VP8_STATUS_USER_ABORT,
-                       "Frame setup failed");
-  }
-
-  // Disable filtering per user request (_after_ setup() is called)
-  if (io->bypass_filtering) dec->filter_type_ = 0;
-
-  // Main decoding loop
-  {
-    const int ret = ParseFrame(dec, io);
-    if (io->teardown) {
-      io->teardown(io);
-    }
-    if (!ret) {
-      VP8Clear(dec);
-      return 0;
-    }
+    return 0;
  }

  dec->ready_ = 0;
@@ -677,6 +754,9 @@ void VP8Clear(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
+  if (dec->use_threads_) {
+    WebPWorkerEnd(&dec->worker_);
+  }
  if (dec->mem_) {
    free(dec->mem_);
  }
@@ -686,7 +766,7 @@ void VP8Clear(VP8Decoder* const dec) {
  dec->ready_ = 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/dec/vp8i.h
+++ b/src/dec/vp8i.h
@@ -13,19 +13,21 @@
 #define WEBP_DEC_VP8I_H_

 #include <string.h>     // for memcpy()
-#include "bits.h"
+#include "../utils/bit_reader.h"
+#include "../utils/thread.h"
+#include "../dsp/dsp.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Various defines and enums

 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 1
-#define DEC_REV_VERSION 2
+#define DEC_REV_VERSION 3

 #define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames

@@ -95,7 +97,7 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Headers

 typedef struct {
@@ -144,19 +146,19 @@ typedef struct {
  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
 } VP8FilterHeader;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Informations about the macroblocks.

-typedef struct {
-  // block type
-  uint8_t skip_:1;
-  // filter specs
-  uint8_t f_level_:6;      // filter strength: 0..63
-  uint8_t f_ilevel_:6;     // inner limit: 1..63
-  uint8_t f_inner_:1;      // do inner filtering?
-  // cbp
-  uint8_t nz_;        // non-zero AC/DC coeffs
-  uint8_t dc_nz_;     // non-zero DC coeffs
+typedef struct {  // filter specs
+  unsigned int f_level_:6;      // filter strength: 0..63
+  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  unsigned int f_inner_:1;      // do inner filtering?
+} VP8FInfo;
+
+typedef struct {  // used for syntax-parsing
+  unsigned int nz_;          // non-zero AC/DC coeffs
+  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  unsigned int skip_:1;      // block type
 } VP8MB;

 // Dequantization matrices
@@ -164,7 +166,16 @@ typedef struct {
  uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2];    // [DC / AC]
 } VP8QuantMatrix;

-//-----------------------------------------------------------------------------
+// Persistent information needed by the parallel processing
+typedef struct {
+  int id_;            // cache row to process (in [0..2])
+  int mb_y_;          // macroblock position of the row
+  int filter_row_;    // true if row-filtering is needed
+  VP8FInfo* f_info_;  // filter strengths
+  VP8Io io_;          // copy of the VP8Io to pass to put()
+} VP8ThreadContext;
+
+//------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user

 struct VP8Decoder {
@@ -181,9 +192,20 @@ struct VP8Decoder {
  VP8FilterHeader  filter_hdr_;
  VP8SegmentHeader segment_hdr_;

+  // Worker
+  WebPWorker worker_;
+  int use_threads_;    // use multi-thread
+  int cache_id_;       // current cache row
+  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
+  VP8ThreadContext thread_ctx_;  // Thread context
+
  // dimension, in macroblock units.
  int mb_w_, mb_h_;

+  // Macroblock to process/filter, depending on cropping and filter_type.
+  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
+  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
+
  // number of partitions.
  int num_parts_;
  // per-partition boolean decoders.
@@ -212,10 +234,11 @@ struct VP8Decoder {
  // Boundary data cache and persistent buffers.
  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
  uint8_t  intra_l_[4];  // left intra modes values
-  uint8_t *y_t_;         // top luma samples: 16 * mb_w_
-  uint8_t *u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each

-  VP8MB* mb_info_;       // contextual macroblock infos (mb_w_ + 1)
+  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;     // filter strength info
  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4

@@ -244,17 +267,35 @@ struct VP8Decoder {
  uint32_t non_zero_ac_;

  // Filtering side-info
-  int filter_type_;                       // 0=off, 1=simple, 2=complex
+  int filter_type_;                         // 0=off, 1=simple, 2=complex
+  int filter_row_;                          // per-row flag
  uint8_t filter_levels_[NUM_MB_SEGMENTS];  // precalculated per-segment
+
+  // extensions
+  const uint8_t* alpha_data_;   // compressed alpha data (if present)
+  size_t alpha_data_size_;
+  uint8_t* alpha_plane_;        // output
+
+  int layer_colorspace_;
+  const uint8_t* layer_data_;   // compressed layer data (if present)
+  size_t layer_data_size_;
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // internal functions. Not public.

 // in vp8.c
 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char * const msg);

+// Validates the VP8 data-header and retrieves basic header information, viz.
+// width and height. Returns 0 in case of formatting error.
+// The width/height/has_alpha pointers can be passed NULL.
+int VP8GetInfo(const uint8_t* data,
+               uint32_t data_size,    // data available so far
+               uint32_t chunk_size,   // total data size expected in the chunk
+               int *width, int *height, int *has_alpha);
+
 // in tree.c
 void VP8ResetProba(VP8Proba* const proba);
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
@@ -267,59 +308,38 @@ void VP8ParseQuant(VP8Decoder* const dec);
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
+// Call io->setup() and finish setting up scan parameters.
+// After this call returns, one must always call VP8ExitCritical() with the
+// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
+// if ok, otherwise sets and returns the error status on *dec.
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
+// Must always be called in pair with VP8EnterCritical().
+// Returns false in case of error.
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+// Filter the decoded macroblock row (if needed)
+int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);   // multi threaded call
+// Process the last decoded row (filtering + output)
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // Store a block, along with filtering params
 void VP8StoreBlock(VP8Decoder* const dec);
 // Finalize and transmit a complete row. Return false in case of user-abort.
-int VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
+int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io);
+// To be called at the start of a new scanline, to initialize predictors.
+void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);

-// in dsp.c
-typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst);
-extern VP8Idct VP8Transform;
-extern VP8Idct VP8TransformUV;
-extern VP8Idct VP8TransformDC;
-extern VP8Idct VP8TransformDCUV;
-extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+// in alpha.c
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows);

-// *dst is the destination block, with stride BPS. Boundary samples are
-// assumed accessible when needed.
-typedef void (*VP8PredFunc)(uint8_t *dst);
-extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
-extern VP8PredFunc VP8PredLuma4[NUM_BMODES];
+// in layer.c
+int VP8DecodeLayer(VP8Decoder* const dec);

-void VP8DspInit(void);        // must be called before anything using the above
-void VP8DspInitTables(void);  // needs to be called no matter what.
-
-// simple filter (only for luma)
-typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
-extern VP8SimpleFilterFunc VP8SimpleVFilter16;
-extern VP8SimpleFilterFunc VP8SimpleHFilter16;
-extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
-extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
-
-// regular filter (on both macroblock edges and inner edges)
-typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
-                                  int thresh, int ithresh, int hev_t);
-typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
-                                    int thresh, int ithresh, int hev_t);
-// on outter edge
-extern VP8LumaFilterFunc VP8VFilter16;
-extern VP8LumaFilterFunc VP8HFilter16;
-extern VP8ChromaFilterFunc VP8VFilter8;
-extern VP8ChromaFilterFunc VP8HFilter8;
-
-// on inner edge
-extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
-extern VP8LumaFilterFunc VP8HFilter16i;
-extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
-extern VP8ChromaFilterFunc VP8HFilter8i;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif  // WEBP_DEC_VP8I_H_
+#endif  /* WEBP_DEC_VP8I_H_ */
--- a/src/dec/webp.c
+++ b/src/dec/webp.c
--- a/src/dec/webpi.h
+++ b/src/dec/webpi.h
@@ -9,55 +9,155 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)

-#ifndef WEBP_DEC_WEBPI_H
-#define WEBP_DEC_WEBPI_H
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#include "webp/decode_vp8.h"
+#include "../webp/decode_vp8.h"

-// Decoding output parameters.
+//------------------------------------------------------------------------------
+// WebPDecParams: Decoding output parameters. Transient internal object.
+
+typedef struct WebPDecParams WebPDecParams;
+typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
+
+// Structure used for on-the-fly rescaling
 typedef struct {
-  uint8_t* output;      // rgb(a) or luma
-  uint8_t *u, *v;       // chroma u/v
-  uint8_t *top_y, *top_u, *top_v;   // cache for the fancy upscaler
-  int stride;           // rgb(a) stride or luma stride
-  int u_stride;         // chroma-u stride
-  int v_stride;         // chroma-v stride
-  WEBP_CSP_MODE mode;   // rgb(a) or yuv
-  int last_y;           // coordinate of the line that was last output
-  int output_size;      // size of 'output' buffer
-  int output_u_size;    // size of 'u' buffer
-  int output_v_size;    // size of 'v' buffer
-  int external_buffer;  // If true, the output buffers are externally owned
-} WebPDecParams;
+  int x_expand;               // true if we're expanding in the x direction
+  int fy_scale, fx_scale;     // fixed-point scaling factor
+  int64_t fxy_scale;          // ''
+  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  int y_accum;                // vertical accumulator
+  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
+  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int src_width, src_height;  // source dimensions
+  int dst_width, dst_height;  // destination dimensions
+  uint8_t* dst;
+  int dst_stride;
+  int32_t* irow, *frow;       // work buffer
+} WebPRescaler;

-// If a RIFF container is detected, validate it and skip over it. Returns
-// VP8 bit-stream size if RIFF header is valid else returns 0
-uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr,
-                             uint32_t *data_size_ptr);
+struct WebPDecParams {
+  WebPDecBuffer* output;             // output buffer.
+  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
+                                     // or used for tmp rescaling

-// Initializes VP8Io with custom setup, io and teardown functions
-void WebPInitCustomIo(VP8Io* const io);
+  int last_y;                 // coordinate of the line that was last output
+  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
+  // rescalers
+  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+  void* memory;               // overall scratch memory for the output work.
+  OutputFunc emit;            // output RGB or YUV samples
+  OutputFunc emit_alpha;      // output alpha channel
+};

-// Initializes params_out by allocating output buffer and setting the
-// stride information. It also outputs width and height information of
-// the WebP image. Returns 1 if succeeds.
-int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width,
-                      int* height, WebPDecParams* const params_out);
+// Should be called first, before any use of the WebPDecParams object.
+void WebPResetDecParams(WebPDecParams* const params);

-// Verifies various size configurations (e.g stride >= width, specified
-// output size <= stride * height etc.). Returns 0 if checks fail.
-int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params);
+//------------------------------------------------------------------------------
+// Header parsing helpers

-// Deallocate memory allocated by WebPInitDecParams() and reset the
-// WebPDecParams object.
-void WebPClearDecParams(WebPDecParams* params);
+#define TAG_SIZE 4
+#define CHUNK_HEADER_SIZE 8
+#define RIFF_HEADER_SIZE 12
+#define FRAME_CHUNK_SIZE 20
+#define LOOP_CHUNK_SIZE 4
+#define TILE_CHUNK_SIZE 8
+#define VP8X_CHUNK_SIZE 12
+#define VP8_FRAME_HEADER_SIZE 10  // Size of the frame header within VP8 data.
+
+// Validates the RIFF container (if detected) and skips over it.
+// If a RIFF container is detected, this
+// returns VP8_STATUS_BITSTREAM_ERROR for an invalid header, and
+//         VP8_STATUS_OK otherwise.
+// If there are not enough bytes (partial RIFF container), *riff_size is set
+// to 0; otherwise it is set to the RIFF size extracted from the header.
+VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size,
+                            uint32_t* riff_size);
+
+// Validates the VP8X Header and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8X chunk is found, bytes_skipped is set to the total number of bytes
+// that are skipped; also width, height & flags are set to the corresponding
+// fields extracted from the VP8X chunk.
+VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size,
+                            uint32_t* bytes_skipped,
+                            int* width, int* height, uint32_t* flags);
+
+// Skips to the next VP8 chunk header in the data given the size of the RIFF
+// chunk 'riff_size'.
+// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
+// that are skipped.
+VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size,
+                                      uint32_t riff_size,
+                                      uint32_t* bytes_skipped);
+
+// Validates the VP8 Header ("VP8 nnnn") and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (vp8_chunk_size greater than
+//         riff_size) VP8 header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes
+// that are skipped and vp8_chunk_size is set to the corresponding size
+// extracted from the VP8 chunk header.
+// For a partial VP8 chunk, vp8_chunk_size is set to 0.
+VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size,
+                                 uint32_t riff_size, uint32_t* bytes_skipped,
+                                 uint32_t* vp8_chunk_size);
+
+// Skips over all valid chunks prior to the first VP8 frame header.
+// Returns VP8_STATUS_OK on success,
+//         VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data.
+// Also, data, data_size, vp8_size & bytes_skipped are updated appropriately
+// on success, where
+// vp8_size is the size of VP8 chunk data (extracted from VP8 chunk header) and
+// bytes_skipped is set to the total number of bytes that are skipped.
+VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size,
+                               uint32_t* vp8_size, uint32_t* bytes_skipped);
+
+//------------------------------------------------------------------------------
+// Misc utils
+
+// Initializes VP8Io with custom setup, io and teardown functions. The default
+// hooks will use the supplied 'params' as io->opaque handle.
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
+
+//------------------------------------------------------------------------------
+// Internal functions regarding WebPDecBuffer memory (in buffer.c).
+// Don't really need to be externally visible for now.
+
+// Prepare 'buffer' with the requested initial dimensions width/height.
+// If no external storage is supplied, initializes buffer by allocating output
+// memory and setting up the stride information. Validate the parameters. Return
+// an error code in case of problem (no memory, or invalid stride / size /
+// dimension / etc.). If *options is not NULL, also verify that the options'
+// parameters are valid and apply them to the width/height dimensions of the
+// output buffer. This takes cropping / scaling / rotation into account.
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const buffer);
+
+// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
+// memory (still held by 'src').
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst);
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
+
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif  // WEBP_DEC_WEBPI_H
+#endif  /* WEBP_DEC_WEBPI_H_ */
--- a/src/dec/yuv.h
+++ b/src/dec/yuv.h
@@ -1,66 +0,0 @@
-// Copyright 2010 Google Inc.
-//
-// This code is licensed under the same terms as WebM:
-//  Software License Agreement:  http://www.webmproject.org/license/software/
-//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
-// -----------------------------------------------------------------------------
-//
-// inline YUV->RGB conversion function
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#ifndef WEBP_DEC_YUV_H_
-#define WEBP_DEC_YUV_H_
-
-#include "webp/decode_vp8.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-enum { YUV_FIX = 16,                // fixed-point precision
-       YUV_RANGE_MIN = -227,        // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
-};
-extern int16_t VP8kVToR[256], VP8kUToB[256];
-extern int32_t VP8kVToG[256], VP8kUToG[256];
-extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-}
-
-inline static void VP8YuvToRgba(int y, int u, int v, uint8_t* const rgba) {
-  VP8YuvToRgb(y, u, v, rgba);
-  rgba[3] = 0xff;
-}
-
-inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
-                               uint8_t* const bgr) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-}
-
-inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) {
-  VP8YuvToBgr(y, u, v, bgra);
-  bgra[3] = 0xff;
-}
-
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
-
-#endif  // WEBP_DEC_YUV_H_
--- a/src/dsp/Makefile.am
+++ b/src/dsp/Makefile.am
@@ -0,0 +1,14 @@
+AM_CPPFLAGS = -I$(top_srcdir)/src
+
+libwebpdsp_la_SOURCES = dsp.h cpu.c \
+                        enc.c enc_sse2.c \
+                        dec.c dec_sse2.c dec_neon.c \
+                        upsampling.c upsampling_sse2.c \
+                        yuv.h yuv.c
+libwebpdsp_la_LDFLAGS = -version-info 0:0:0 -lm
+libwebpdsp_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebpdspinclude_HEADERS = ../webp/types.h
+libwebpdspincludedir = $(includedir)/webp
+
+noinst_HEADERS = dsp.h yuv.h
+noinst_LTLIBRARIES = libwebpdsp.la
--- a/src/dsp/cpu.c
+++ b/src/dsp/cpu.c
@@ -0,0 +1,70 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include <stddef.h>  // for NULL
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+#if defined(__pic__) && defined(__i386__)
+static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(_MSC_VER)  // Visual C++
+#define GetCPUInfo __cpuid
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);
+  }
+  if (feature == kSSE3) {
+    return 0 != (cpu_info[2] & 0x00000001);
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(__ARM_NEON__)
+// define a dummy function to enable turning off NEON at runtime by setting
+// VP8GetCPUInfo = NULL
+static int armCPUInfo(CPUFeature feature) {
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/dec.c
+++ b/src/dsp/dec.c
@@ -5,21 +5,18 @@
 //  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
 // -----------------------------------------------------------------------------
 //
-// speed-critical functions.
+// Speed-critical decoding functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "vp8i.h"
-
-#if defined(__SSE2__)
-#include <emmintrin.h>
-#endif
+#include "./dsp.h"
+#include "../dec/vp8i.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // run-time tables (~4k)

 static uint8_t abs0[255 + 255 + 1];     // abs(i)
@@ -32,7 +29,7 @@ static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;

-void VP8DspInitTables(void) {
+static void DspInitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255; ++i) {
@@ -56,7 +53,7 @@ static inline uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
@@ -66,7 +63,7 @@ static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)

-static void Transform(const int16_t* in, uint8_t* dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@@ -106,11 +103,16 @@ static void Transform(const int16_t* in, uint8_t* dst) {
 }
 #undef MUL

+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
 static void TransformUV(const int16_t* in, uint8_t* dst) {
-  Transform(in + 0 * 16, dst);
-  Transform(in + 1 * 16, dst + 4);
-  Transform(in + 2 * 16, dst + 4 * BPS);
-  Transform(in + 3 * 16, dst + 4 * BPS + 4);
+  VP8Transform(in + 0 * 16, dst, 1);
+  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }

 static void TransformDC(const int16_t *in, uint8_t* dst) {
@@ -132,13 +134,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {

 #undef STORE

-// default C implementations:
-VP8Idct VP8Transform = Transform;
-VP8Idct VP8TransformUV = TransformUV;
-VP8Idct VP8TransformDC = TransformDC;
-VP8Idct VP8TransformDCUV = TransformDCUV;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 14.3

 static void TransformWHT(const int16_t* in, int16_t* out) {
@@ -170,10 +166,10 @@ static void TransformWHT(const int16_t* in, int16_t* out) {

 void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions

-#define OUT(x, y) dst[(x) + (y) * BPS]
+#define DST(x, y) dst[(x) + (y) * BPS]

 static inline void TrueMotion(uint8_t *dst, int size) {
  const uint8_t* top = dst - BPS;
@@ -192,7 +188,7 @@ static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
 static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
 static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 16x16

 static void VE16(uint8_t *dst) {     // vertical
@@ -248,7 +244,7 @@ static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
  Put16(0x80, dst);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // 4x4

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -298,13 +294,13 @@ static void RD4(uint8_t *dst) {   // Down-right
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
-  OUT(0, 3)                                     = AVG3(J, K, L);
-  OUT(0, 2) = OUT(1, 3)                         = AVG3(I, J, K);
-  OUT(0, 1) = OUT(1, 2) = OUT(2, 3)             = AVG3(X, I, J);
-  OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I);
-  OUT(1, 0) = OUT(2, 1) = OUT(3, 2)             = AVG3(B, A, X);
-  OUT(2, 0) = OUT(3, 1)                         = AVG3(C, B, A);
-  OUT(3, 0)                                     = AVG3(D, C, B);
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
 }

 static void LD4(uint8_t *dst) {   // Down-Left
@@ -316,13 +312,13 @@ static void LD4(uint8_t *dst) {   // Down-Left
  const int F = dst[5 - BPS];
  const int G = dst[6 - BPS];
  const int H = dst[7 - BPS];
-  OUT(0, 0)                                     = AVG3(A, B, C);
-  OUT(1, 0) = OUT(0, 1)                         = AVG3(B, C, D);
-  OUT(2, 0) = OUT(1, 1) = OUT(0, 2)             = AVG3(C, D, E);
-  OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F);
-  OUT(3, 1) = OUT(2, 2) = OUT(1, 3)             = AVG3(E, F, G);
-  OUT(3, 2) = OUT(2, 3)                         = AVG3(F, G, H);
-  OUT(3, 3)                                     = AVG3(G, H, H);
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
 }

 static void VR4(uint8_t *dst) {   // Vertical-Right
@@ -334,17 +330,17 @@ static void VR4(uint8_t *dst) {   // Vertical-Right
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
-  OUT(0, 0) = OUT(1, 2) = AVG2(X, A);
-  OUT(1, 0) = OUT(2, 2) = AVG2(A, B);
-  OUT(2, 0) = OUT(3, 2) = AVG2(B, C);
-  OUT(3, 0)             = AVG2(C, D);
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);

-  OUT(0, 3) =             AVG3(K, J, I);
-  OUT(0, 2) =             AVG3(J, I, X);
-  OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B);
-  OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C);
-  OUT(3, 1) =             AVG3(B, C, D);
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
 }

 static void VL4(uint8_t *dst) {   // Vertical-Left
@@ -356,17 +352,17 @@ static void VL4(uint8_t *dst) {   // Vertical-Left
  const int F = dst[5 - BPS];
  const int G = dst[6 - BPS];
  const int H = dst[7 - BPS];
-  OUT(0, 0) =             AVG2(A, B);
-  OUT(1, 0) = OUT(0, 2) = AVG2(B, C);
-  OUT(2, 0) = OUT(1, 2) = AVG2(C, D);
-  OUT(3, 0) = OUT(2, 2) = AVG2(D, E);
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);

-  OUT(0, 1) =             AVG3(A, B, C);
-  OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D);
-  OUT(2, 1) = OUT(1, 3) = AVG3(C, D, E);
-  OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F);
-              OUT(3, 2) = AVG3(E, F, G);
-              OUT(3, 3) = AVG3(F, G, H);
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
 }

 static void HU4(uint8_t *dst) {   // Horizontal-Up
@@ -374,14 +370,14 @@ static void HU4(uint8_t *dst) {   // Horizontal-Up
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
  const int L = dst[-1 + 3 * BPS];
-  OUT(0, 0) =             AVG2(I, J);
-  OUT(2, 0) = OUT(0, 1) = AVG2(J, K);
-  OUT(2, 1) = OUT(0, 2) = AVG2(K, L);
-  OUT(1, 0) =             AVG3(I, J, K);
-  OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L);
-  OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L);
-  OUT(3, 2) = OUT(2, 2) =
-    OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L;
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }

 static void HD4(uint8_t *dst) {  // Horizontal-Down
@@ -394,23 +390,24 @@ static void HD4(uint8_t *dst) {  // Horizontal-Down
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];

-  OUT(0, 0) = OUT(2, 1) = AVG2(I, X);
-  OUT(0, 1) = OUT(2, 2) = AVG2(J, I);
-  OUT(0, 2) = OUT(2, 3) = AVG2(K, J);
-  OUT(0, 3)             = AVG2(L, K);
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);

-  OUT(3, 0)             = AVG3(A, B, C);
-  OUT(2, 0)             = AVG3(X, A, B);
-  OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A);
-  OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X);
-  OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I);
-  OUT(1, 3)             = AVG3(L, K, J);
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
 }

+#undef DST
 #undef AVG3
 #undef AVG2

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma

 static void VE8uv(uint8_t *dst) {    // vertical
@@ -467,24 +464,24 @@ static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
  Put8x8uv(0x8080808080808080ULL, dst);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // default C implementations

-VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
+VP8PredFunc VP8PredLuma4[/* NUM_BMODES */] = {
  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
 };

-VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
+VP8PredFunc VP8PredLuma16[/*NUM_B_DC_MODES */] = {
  DC16, TM16, VE16, HE16,
  DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };

-VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
+VP8PredFunc VP8PredChroma8[/*NUM_B_DC_MODES */] = {
  DC8uv, TM8uv, VE8uv, HE8uv,
  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions

 // 4 pixels in, 2 pixels out
@@ -546,7 +543,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -583,7 +580,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

 static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size,
@@ -669,26 +666,62 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

-void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16;
-void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16;
-void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8;
-void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8;
-void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
-void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
-void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
-void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
+VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformUV;
+VP8DecIdct VP8TransformDC;
+VP8DecIdct VP8TransformDCUV;

-void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16;
-void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16;
-void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
-void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
+VP8LumaFilterFunc VP8VFilter16;
+VP8LumaFilterFunc VP8HFilter16;
+VP8ChromaFilterFunc VP8VFilter8;
+VP8ChromaFilterFunc VP8HFilter8;
+VP8LumaFilterFunc VP8VFilter16i;
+VP8LumaFilterFunc VP8HFilter16i;
+VP8ChromaFilterFunc VP8VFilter8i;
+VP8ChromaFilterFunc VP8HFilter8i;
+VP8SimpleFilterFunc VP8SimpleVFilter16;
+VP8SimpleFilterFunc VP8SimpleHFilter16;
+VP8SimpleFilterFunc VP8SimpleVFilter16i;
+VP8SimpleFilterFunc VP8SimpleHFilter16i;

-//-----------------------------------------------------------------------------
+extern void VP8DspInitSSE2(void);
+extern void VP8DspInitNEON(void);

 void VP8DspInit(void) {
-  // later we'll plug some SSE2 variant here
+  DspInitTables();
+
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo) {
+#if defined(__SSE2__) || defined(_MSC_VER)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8DspInitSSE2();
+    }
+#elif defined(__GNUC__) && defined(__ARM_NEON__)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
+#endif
+  }
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/dsp/dec_neon.c
+++ b/src/dsp/dec_neon.c
@@ -0,0 +1,168 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of dsp functions and loop filtering.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#if defined(__GNUC__) && defined(__ARM_NEON__)
+
+#include "../dec/vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
+              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+
+#define FLIP_SIGN_BIT2(a, b, s)                                                \
+  "veor     " #a "," #a "," #s "               \n"                             \
+  "veor     " #b "," #b "," #s "               \n"                             \
+
+#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
+  FLIP_SIGN_BIT2(a, b, s)                                                      \
+  FLIP_SIGN_BIT2(c, d, s)                                                      \
+
+#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
+  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
+  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
+  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
+  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
+  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+  "vdup.8     q14, " #thresh "            \n"                                  \
+  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (q15 <= thresh) */
+
+#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
+  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
+  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
+
+#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
+  "vmov.i8    q15, #0x03                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
+  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
+                                                                               \
+  "vmov.i8    q15, #0x04                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
+  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
+
+// Applies filter on 2 pixels (p0 and q0)
+#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
+  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
+  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
+  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
+  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
+  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
+  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
+  FLIP_SIGN_BIT2(p0, q0, q10)
+
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
+  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+
+#define STORE8x2(c1, c2, p,stride)                                             \
+  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
+    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
+    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
+    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
+    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", QRegs
+  );
+}
+
+static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
+    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
+    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
+
+    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
+    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
+    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
+    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
+    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], #1                 \n"  // p - 1
+
+    "vswp        d5, d6                        \n"
+    STORE8x2(d4, d5, [%[p]], %[stride])
+    STORE8x2(d6, d7, [%[p]], %[stride])
+
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", "r4", "r5", "r6", QRegs
+  );
+}
+
+static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16NEON(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16NEON(p, stride, thresh);
+  }
+}
+
+extern void VP8DspInitNEON(void);
+
+void VP8DspInitNEON(void) {
+  VP8SimpleVFilter16 = SimpleVFilter16NEON;
+  VP8SimpleHFilter16 = SimpleHFilter16NEON;
+  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
+  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // __GNUC__ && __ARM_NEON__
--- a/src/dsp/dec_sse2.c
+++ b/src/dsp/dec_sse2.c
@@ -0,0 +1,898 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of some decoding functions (idct, loop filtering).
+//
+// Author: somnath@google.com (Somnath Banerjee)
+//         cduvivier@google.com (Christian Duvivier)
+
+#if defined(__SSE2__) || defined(_MSC_VER)
+
+#include <emmintrin.h>
+#include "../dec/vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85627  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant becomes the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two transforms
+  // in parallel). In the case of only one transform, the second half of the
+  // vectors will just contain don't-care values we never use nor store.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'dst' and store.
+  {
+    const __m128i zero = _mm_set1_epi16(0);
+    // Load the reference(s).
+    __m128i dst0, dst1, dst2, dst3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+    }
+    // Convert to 16b.
+    dst0 = _mm_unpacklo_epi8(dst0, zero);
+    dst1 = _mm_unpacklo_epi8(dst1, zero);
+    dst2 = _mm_unpacklo_epi8(dst2, zero);
+    dst3 = _mm_unpacklo_epi8(dst3, zero);
+    // Add the inverse transform(s).
+    dst0 = _mm_add_epi16(dst0, T0);
+    dst1 = _mm_add_epi16(dst1, T1);
+    dst2 = _mm_add_epi16(dst2, T2);
+    dst3 = _mm_add_epi16(dst3, T3);
+    // Unsigned saturate to 8b.
+    dst0 = _mm_packus_epi16(dst0, dst0);
+    dst1 = _mm_packus_epi16(dst1, dst1);
+    dst2 = _mm_packus_epi16(dst2, dst2);
+    dst3 = _mm_packus_epi16(dst3, dst3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Loop Filter (Paragraph 15)
+
+// Compute abs(p - q) = subs(p - q) OR subs(q - p)
+#define MM_ABS(p, q)  _mm_or_si128(                                            \
+    _mm_subs_epu8((q), (p)),                                                   \
+    _mm_subs_epu8((p), (q)))
+
+// Shift each byte of "a" by N bits while preserving the sign bit.
+//
+// It first shifts the lower bytes of the words and then the upper bytes and
+// then merges the results together.
+#define SIGNED_SHIFT_N(a, N) {                                                 \
+  __m128i t = a;                                                               \
+  t = _mm_slli_epi16(t, 8);                                                    \
+  t = _mm_srai_epi16(t, N);                                                    \
+  t = _mm_srli_epi16(t, 8);                                                    \
+                                                                               \
+  a = _mm_srai_epi16(a, N + 8);                                                \
+  a = _mm_slli_epi16(a, 8);                                                    \
+                                                                               \
+  a = _mm_or_si128(t, a);                                                      \
+}
+
+#define FLIP_SIGN_BIT2(a, b) {                                                 \
+  a = _mm_xor_si128(a, sign_bit);                                              \
+  b = _mm_xor_si128(b, sign_bit);                                              \
+}
+
+#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
+  FLIP_SIGN_BIT2(a, b);                                                        \
+  FLIP_SIGN_BIT2(c, d);                                                        \
+}
+
+#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
+  const __m128i zero = _mm_setzero_si128();                                    \
+  const __m128i t1 = MM_ABS(p1, p0);                                           \
+  const __m128i t2 = MM_ABS(q1, q0);                                           \
+                                                                               \
+  const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
+  const __m128i t3 = _mm_subs_epu8(t1, h);  /* abs(p1 - p0) - hev_thresh */    \
+  const __m128i t4 = _mm_subs_epu8(t2, h);  /* abs(q1 - q0) - hev_thresh */    \
+                                                                               \
+  not_hev = _mm_or_si128(t3, t4);                                              \
+  not_hev = _mm_cmpeq_epi8(not_hev, zero); /* t1 <= h && t2 <= h */            \
+}
+
+#define GET_BASE_DELTA(p1, p0, q0, q1, o) {                                    \
+  const __m128i qp0 = _mm_subs_epi8(q0, p0);  /* q0 - p0 */                    \
+  o = _mm_subs_epi8(p1, q1);            /* p1 - q1 */                          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 1 * (q0 - p0) */          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 2 * (q0 - p0) */          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 3 * (q0 - p0) */          \
+}
+
+#define DO_SIMPLE_FILTER(p0, q0, fl) {                                         \
+  const __m128i three = _mm_set1_epi8(3);                                      \
+  const __m128i four = _mm_set1_epi8(4);                                       \
+  __m128i v3 = _mm_adds_epi8(fl, three);                                       \
+  __m128i v4 = _mm_adds_epi8(fl, four);                                        \
+                                                                               \
+  /* Do +4 side */                                                             \
+  SIGNED_SHIFT_N(v4, 3);                /* v4 >> 3  */                         \
+  q0 = _mm_subs_epi8(q0, v4);           /* q0 -= v4 */                         \
+                                                                               \
+  /* Now do +3 side */                                                         \
+  SIGNED_SHIFT_N(v3, 3);                /* v3 >> 3  */                         \
+  p0 = _mm_adds_epi8(p0, v3);           /* p0 += v3 */                         \
+}
+
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)]
+#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
+  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
+  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
+  const __m128i a = _mm_packs_epi16(a_lo7, a_hi7);                             \
+  pi = _mm_adds_epi8(pi, a);                                                   \
+  qi = _mm_subs_epi8(qi, a);                                                   \
+}
+
+static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
+                        const __m128i* q1, int thresh, __m128i *mask) {
+  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
+  *mask = _mm_set1_epi8(0xFE);
+  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
+  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2
+
+  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
+  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
+  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+  t1 = _mm_set1_epi8(thresh);
+  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
+  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// Applies filter on 2 pixels (p0 and q0)
+static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
+                             const __m128i* q1, int thresh) {
+  __m128i a, mask;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
+  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
+
+  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+
+  // convert to signed values
+  FLIP_SIGN_BIT2(*p0, *q0);
+
+  GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
+  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
+  DO_SIMPLE_FILTER(*p0, *q0, a);
+
+  // unoffset
+  FLIP_SIGN_BIT2(*p0, *q0);
+}
+
+// Applies filter on 4 pixels (p1, p0, q0 and q1)
+static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1,
+                             const __m128i* mask, int hev_thresh) {
+  __m128i not_hev;
+  __m128i t1, t2, t3;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+
+  // compute hev mask
+  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+
+  // convert to signed values
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+
+  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
+  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
+  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
+  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about
+
+  // Do +4 side
+  t2 = _mm_set1_epi8(4);
+  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
+  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  t3 = t2;                           // save t2
+  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2
+
+  // Now do +3 side
+  t2 = _mm_set1_epi8(3);
+  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
+  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
+
+  t2 = _mm_set1_epi8(1);
+  t3 = _mm_adds_epi8(t3, t2);
+  SIGNED_SHIFT_N(t3, 1);             // (saved +4 side delta + 1) >> 1
+
+  t3 = _mm_and_si128(not_hev, t3);   // if !hev
+  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
+  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
+
+  // unoffset
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+}
+
+// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
+static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
+                             __m128i* q0, __m128i* q1, __m128i *q2,
+                             const __m128i* mask, int hev_thresh) {
+  __m128i a, not_hev;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+
+  // compute hev mask
+  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+
+  // convert to signed values
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+
+  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
+
+  { // do simple filter on pixels with hev
+    const __m128i m = _mm_andnot_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+    DO_SIMPLE_FILTER(*p0, *q0, f);
+  }
+  { // do strong filter on pixels with not hev
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i nine = _mm_set1_epi16(0x0900);
+    const __m128i sixty_three = _mm_set1_epi16(63);
+
+    const __m128i m = _mm_and_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
+    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
+
+    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
+    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
+    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
+    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18
+
+    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
+    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63
+
+    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
+    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63
+
+    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
+    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63
+
+    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
+    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
+    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
+  }
+
+  // unoffset
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+}
+
+// reads 8 rows across a vertical edge.
+//
+// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
+// two Load4x4() to avoid code duplication.
+static inline void Load8x4(const uint8_t* b, int stride,
+                           __m128i* p, __m128i* q) {
+  __m128i t1, t2;
+
+  // Load 0th, 1st, 4th and 5th rows
+  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
+  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
+  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
+  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10
+
+  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+  t1 = _mm_unpacklo_epi8(r0, r1);
+
+  // Load 2nd, 3rd, 6th and 7th rows
+  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 20
+  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
+  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
+  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30
+
+  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+  t2 = _mm_unpacklo_epi8(r0, r1);
+
+  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+  r0 = t1;
+  t1 = _mm_unpacklo_epi16(t1, t2);
+  t2 = _mm_unpackhi_epi16(r0, t2);
+
+  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  *p = _mm_unpacklo_epi32(t1, t2);
+  *q = _mm_unpackhi_epi32(t1, t2);
+}
+
+static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
+                            __m128i* p1, __m128i* p0,
+                            __m128i* q0, __m128i* q1) {
+  __m128i t1, t2;
+  // Assume the pixels around the edge (|) are numbered as follows
+  //                00 01 | 02 03
+  //                10 11 | 12 13
+  //                 ...  |  ...
+  //                e0 e1 | e2 e3
+  //                f0 f1 | f2 f3
+  //
+  // r0 is pointing to the 0th row (00)
+  // r8 is pointing to the 8th row (80)
+
+  // Load
+  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+  Load8x4(r0, stride, p1, q0);
+  Load8x4(r8, stride, p0, q1);
+
+  t1 = *p1;
+  t2 = *q0;
+  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+  *p1 = _mm_unpacklo_epi64(t1, *p0);
+  *p0 = _mm_unpackhi_epi64(t1, *p0);
+  *q0 = _mm_unpacklo_epi64(t2, *q1);
+  *q1 = _mm_unpackhi_epi64(t2, *q1);
+}
+
+static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+  int i;
+  for (i = 0; i < 4; ++i, dst += stride) {
+    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+    *x = _mm_srli_si128(*x, 4);
+  }
+}
+
+// Transpose back and store
+static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
+                             __m128i* p0, __m128i* q0, __m128i* q1) {
+  __m128i t1;
+
+  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi8(*p1, t1);
+  *p1 = _mm_unpackhi_epi8(*p1, t1);
+
+  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+  t1 = *q0;
+  *q0 = _mm_unpacklo_epi8(t1, *q1);
+  *q1 = _mm_unpackhi_epi8(t1, *q1);
+
+  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi16(t1, *q0);
+  *q0 = _mm_unpackhi_epi16(t1, *q0);
+
+  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+  t1 = *p1;
+  *p1 = _mm_unpacklo_epi16(t1, *q1);
+  *q1 = _mm_unpackhi_epi16(t1, *q1);
+
+  Store4x4(p0, r0, stride);
+  r0 += 4 * stride;
+  Store4x4(q0, r0, stride);
+
+  Store4x4(p1, r8, stride);
+  r8 += 4 * stride;
+  Store4x4(q1, r8, stride);
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
+  // Load
+  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
+  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
+  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
+  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
+
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-stride], p0);
+  _mm_storeu_si128((__m128i*)p, q0);
+}
+
+static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
+  __m128i p1, p0, q0, q1;
+
+  p -= 2;  // beginning of p1
+
+  Load16x4(p, p + 8 * stride,  stride, &p1, &p0, &q0, &q1);
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+}
+
+static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16SSE2(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16SSE2(p, stride, thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
+  m = MM_ABS(p3, p2);                                                          \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+}
+
+#define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+}
+
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
+  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
+  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
+  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
+  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+}
+
+#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
+  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
+  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
+}
+
+#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
+  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
+  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
+  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
+  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+}
+
+#define STOREUV(p, u, v, stride) {                                             \
+  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
+  p = _mm_srli_si128(p, 8);                                                    \
+  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+}
+
+#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) {               \
+  __m128i fl_yes;                                                              \
+  const __m128i it = _mm_set1_epi8(ithresh);                                   \
+  mask = _mm_subs_epu8(mask, it);                                              \
+  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());                            \
+  NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes);                            \
+  mask = _mm_and_si128(mask, fl_yes);                                          \
+}
+
+// on macroblock edges
+static void VFilter16SSE2(uint8_t* p, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i t1;
+  __m128i mask;
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
+  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+  _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+  _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
+}
+
+static void HFilter16SSE2(uint8_t* p, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  uint8_t* const b = p - 4;
+  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
+  Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+}
+
+// on three inner edges
+// Regular loop filter on horizontal luma edges, run three times. Each
+// iteration loads four rows starting at 'p', moves 'p' down by four rows,
+// loads four more rows, and rewrites the two rows on either side of the
+// boundary (p1, p0 above it and q0, q1 below it) after DoFilter4().
+static void VFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    // Load p3, p2, p1, p0
+    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    // 'p' now addresses q0, the first row below the edge being filtered.
+    p += 4 * stride;
+
+    // Load q0, q1, q2, q3
+    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    // Store
+    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  }
+}
+
+// Regular loop filter on vertical luma edges, run three times. Each
+// iteration reads the four columns starting at 'p' (p3..p0) and the four
+// columns after them (q0..q3), calls DoFilter4() and rewrites the four
+// columns p1, p0, q0, q1 around the edge; 'p' then advances by four columns.
+// NOTE(review): Load16x4()/Store16x4() are defined outside this chunk.
+static void HFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  uint8_t* b;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    b = p;
+    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    b += 4;  // beginning of q0
+    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    b -= 2;  // beginning of p1
+    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+
+    // Move on to the next edge, four columns further.
+    p += 4;
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+// Regular loop filter across a horizontal edge, handling the U and V planes
+// in the same registers. 'u' and 'v' address the first row below the edge
+// (q0); the four rows above are read starting at 'u - 4 * stride' and
+// 'v - 4 * stride'. Rows p2..q2 of both planes are rewritten.
+// NOTE(review): LOADUV_H_EDGES4/STOREUV are macros defined outside this
+// chunk; how U and V are packed into one vector should be checked there.
+static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p2, u, v, -3 * stride);
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+  STOREUV(q2, u, v, 2 * stride);
+}
+
+// Regular loop filter across a vertical chroma edge, with U and V processed
+// together: the two pointers given to Load16x4()/Store16x4() are the U and V
+// planes. 'u' and 'v' address q0; p3..p0 are read starting four samples to
+// the left. DoFilter6() only receives p2..q2, but all eight vectors are
+// stored back to where they were loaded from.
+// NOTE(review): Load16x4()/Store16x4() are defined outside this chunk.
+static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  // Addresses of p3 in each plane.
+  uint8_t* const tu = u - 4;
+  uint8_t* const tv = v - 4;
+  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
+  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
+}
+
+// Regular loop filter on one horizontal chroma edge, U and V together.
+// Four rows are loaded starting at 'u'/'v', both pointers are moved down by
+// four rows, four more rows are loaded, and after DoFilter4() the two rows
+// on each side of the boundary (p1, p0, q0, q1) are rewritten in both planes.
+// NOTE(review): LOADUV_H_EDGES4/STOREUV are macros defined outside this chunk.
+static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  // 'u' and 'v' now address q0, the first row below the edge.
+  u += 4 * stride;
+  v += 4 * stride;
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+}
+
+// Regular loop filter on one vertical chroma edge, U and V together.
+// The four columns starting at 'u'/'v' are read as p3..p0 and the next four
+// as q0..q3; after DoFilter4() the columns p1, p0, q0, q1 are written back,
+// starting two samples before q0.
+// NOTE(review): Load16x4()/Store16x4() are defined outside this chunk.
+static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4;  // beginning of q0
+  v += 4;
+  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  u -= 2;  // beginning of p1
+  v -= 2;
+  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+}
+
+extern void VP8DspInitSSE2(void);
+
+// Points the decoder's DSP hooks at the SSE2 implementations of this file.
+void VP8DspInitSSE2(void) {
+  // Inverse transform.
+  VP8Transform = TransformSSE2;
+
+  // Regular loop filter, luma (macroblock edges, then inner edges).
+  VP8VFilter16 = VFilter16SSE2;
+  VP8HFilter16 = HFilter16SSE2;
+  VP8VFilter16i = VFilter16iSSE2;
+  VP8HFilter16i = HFilter16iSSE2;
+
+  // Regular loop filter, chroma (macroblock edges, then inner edges).
+  VP8VFilter8 = VFilter8SSE2;
+  VP8HFilter8 = HFilter8SSE2;
+  VP8VFilter8i = VFilter8iSSE2;
+  VP8HFilter8i = HFilter8iSSE2;
+
+  // Simple loop filter (luma only).
+  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   //__SSE2__ || _MSC_VER
--- a/src/dsp/dsp.h
+++ b/src/dsp/dsp.h
@@ -0,0 +1,175 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//   Speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_DSP_H_
+#define WEBP_DSP_DSP_H_
+
+#include "../webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// CPU detection
+
+// Features that can be queried through the VP8GetCPUInfo hook.
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kNEON
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+// Detection hook. VP8EncDspInit() tests it against NULL before calling it.
+extern VP8CPUInfo VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Encoding
+
+// Turns a histogram of binned coefficient magnitudes into a value clipped to
+// the [0..255] range.
+int VP8GetAlpha(const int histo[]);
+
+// Transforms
+// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
+//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
+typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                        int do_two);
+// VP8Fdct: forward transform of the difference between 'src' and 'ref'.
+typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
+typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+extern VP8Idct VP8ITransform;
+extern VP8Fdct VP8FTransform;
+extern VP8WHT VP8ITransformWHT;
+extern VP8WHT VP8FTransformWHT;
+// Predictions
+// *dst is the destination block. *top and *left can be NULL.
+typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+                              const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+extern VP8Intra4Preds VP8EncPredLuma4;
+extern VP8IntraPreds VP8EncPredLuma16;
+extern VP8IntraPreds VP8EncPredChroma8;
+
+// Sum of squared differences between two blocks of the given dimensions.
+typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
+// Weighted texture distortion between two blocks.
+typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
+                          const uint16_t* const weights);
+extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
+
+// Square block copy; both 'src' and 'dst' are walked with stride BPS.
+typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy8x8;
+extern VP8BlockCopy VP8Copy16x16;
+// Quantization
+// Quantizes in[] into out[] from scan position 'n' onward. in[] is overwritten
+// with the dequantized values. Returns non-zero if a non-zero level was
+// produced.
+struct VP8Matrix;   // forward declaration
+typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
+                                int n, const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlock VP8EncQuantizeBlock;
+
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                         int start_block, int end_block);
+// Offsets of the 4x4 blocks (16 luma, 4 U, 4 V) indexed by block number.
+extern const int VP8DspScan[16 + 4 + 4];
+extern VP8CHisto VP8CollectHistogram;
+
+void VP8EncDspInit(void);   // must be called before using any of the above
+
+//------------------------------------------------------------------------------
+// Decoding
+
+// Inverse-transform hooks: 'coeffs' are the input coefficients and 'dst' the
+// destination samples.
+typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+// when doing two transforms, coeffs is actually int16_t[2][16].
+typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformUV;
+extern VP8DecIdct VP8TransformDC;
+extern VP8DecIdct VP8TransformDCUV;
+extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t* dst);
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+// The luma hooks take a single plane pointer; the chroma hooks take the U
+// and V plane pointers together, sharing one stride.
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outer edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+// must be called before anything using the above
+extern void VP8DspInit(void);
+
+//------------------------------------------------------------------------------
+// WebP I/O
+
+#define FANCY_UPSAMPLING   // undefine to remove fancy upsampling support
+
+#ifdef FANCY_UPSAMPLING
+// Converts a pair of rows at once: two luma rows (top_y, bottom_y) plus the
+// chroma rows named top_u/top_v and cur_u/cur_v, written to top_dst and
+// bottom_dst.
+// NOTE(review): 'len' is presumably the row width in pixels -- confirm
+// against the implementations.
+typedef void (*WebPUpsampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* cur_u, const uint8_t* cur_v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+
+// Fancy upsampling functions to convert YUV to RGB(A) modes
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];
+
+// Initializes SSE2 version of the fancy upsamplers.
+void WebPInitUpsamplersSSE2(void);
+
+#endif    // FANCY_UPSAMPLING
+
+// Point-sampling methods.
+// Same row-pair layout as above, but with a single chroma row (u, v).
+typedef void (*WebPSampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* u, const uint8_t* v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+
+// YUV444->RGB converters
+// One row at a time, with one u and one v pointer alongside y.
+typedef void (*WebPYUV444Converter)(const uint8_t* y,
+                                    const uint8_t* u, const uint8_t* v,
+                                    uint8_t* dst, int len);
+
+extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+
+// Main function to be called
+void WebPInitUpsamplers(void);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_DSP_H_ */
--- a/src/dsp/enc.c
+++ b/src/dsp/enc.c
@@ -0,0 +1,744 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Speed-critical encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "../enc/vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+// Clamps 'alpha' to the [0, 255] range.
+static int ClipAlpha(int alpha) {
+  if (alpha < 0) return 0;
+  if (alpha > 255) return 255;
+  return alpha;
+}
+
+// Derives a value in [0..255] from the histogram bins 1..MAX_COEFF_THRESH
+// (bin 0 is ignored). Returns 0 when all of those bins are empty.
+int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
+  int weighted = 0, norm = 0, running = 0;
+  int k;
+  // note: changing this loop to avoid the numerous "k + 1" slows things down.
+  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
+    if (!histo[k + 1]) continue;
+    running += histo[k + 1];            // cumulated count of non-empty bins
+    weighted += running * (k + 1);
+    norm += (k + 1) * (k + 1);
+  }
+  if (!norm) return ClipAlpha(0);
+  // we scale the value to a usable [0..255] range
+  return ClipAlpha(10 * weighted / norm - 5);
+}
+
+// Offsets, within a buffer of stride BPS, of the top-left sample of each 4x4
+// block: the 16 luma blocks row by row, followed by 4 blocks for U and 4 for
+// V. The V entries equal the U entries shifted right by 8 columns.
+const int VP8DspScan[16 + 4 + 4] = {
+  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
+// Forward-transforms blocks [start_block, end_block) of 'ref' against 'pred',
+// histograms the coefficient magnitudes (abs(coeff) >> 2, capped at
+// MAX_COEFF_THRESH) and returns VP8GetAlpha() of that histogram.
+static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                            int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int block;
+  for (block = start_block; block < end_block; ++block) {
+    const int off = VP8DspScan[block];
+    int16_t coeffs[16];
+    int i;
+    VP8FTransform(ref + off, pred + off, coeffs);
+
+    // Map each coefficient to its bin and count it right away.
+    for (i = 0; i < 16; ++i) {
+      int bin = abs(coeffs[i]) >> 2;
+      if (bin > MAX_COEFF_THRESH) bin = MAX_COEFF_THRESH;
+      ++histo[bin];
+    }
+  }
+
+  return VP8GetAlpha(histo);
+}
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+// Lookup table: clip1[255 + i] is 'i' clipped to [0,255], for i in [-255,510].
+static uint8_t clip1[255 + 510 + 1];
+
+// Set to 1 once clip1[] has been filled. Declared 'volatile' with the intent
+// of having it written _last_, after the table contents.
+static volatile int tables_ok = 0;
+
+// Fills clip1[] on the first call; later calls return immediately.
+static void InitTables(void) {
+  int v;
+  if (tables_ok) return;
+  for (v = -255; v <= 510; ++v) {
+    uint8_t clipped;
+    if (v < 0) {
+      clipped = 0;
+    } else if (v > 255) {
+      clipped = 255;
+    } else {
+      clipped = (uint8_t)v;
+    }
+    clip1[v + 255] = clipped;
+  }
+  tables_ok = 1;
+}
+
+// Saturates 'v' to the 8-bit range [0, 255].
+static inline uint8_t clip_8b(int v) {
+  if (v < 0) return 0;
+  if (v > 255) return 255;
+  return (uint8_t)v;
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Adds the descaled value (v >> 3) to the reference sample at (x, y) and
+// writes the result, clipped to 8 bits, to the same position of 'dst'.
+// Expects 'ref' and 'dst' to be in scope at the point of use.
+#define STORE(x, y, v) \
+  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
+
+// Transform constants in 16-bit fixed point: MUL() discards the low 16 bits
+// of the product, so kC1 stands for 1 + 20091 / 65536 and kC2 for
+// 35468 / 65536.
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+// Inverse-transforms the 16 coefficients of 'in' and adds the result to the
+// 4x4 block at 'ref', storing the clipped sum into 'dst' (both stride BPS).
+static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst) {
+  int C[4 * 4];
+  int i;
+
+  // vertical pass: input column 'i' fills C[4 * i .. 4 * i + 3]
+  for (i = 0; i < 4; ++i) {
+    int* const col = &C[4 * i];
+    const int a = in[i + 0] + in[i + 8];
+    const int b = in[i + 0] - in[i + 8];
+    const int c = MUL(in[i + 4], kC2) - MUL(in[i + 12], kC1);
+    const int d = MUL(in[i + 4], kC1) + MUL(in[i + 12], kC2);
+    col[0] = a + d;
+    col[1] = b + c;
+    col[2] = b - c;
+    col[3] = a - d;
+  }
+
+  // horizontal pass: output row 'i' reads C[i], C[i + 4], C[i + 8], C[i + 12]
+  for (i = 0; i < 4; ++i) {
+    const int dc = C[i + 0] + 4;    // rounder for the final '>> 3'
+    const int a = dc + C[i + 8];
+    const int b = dc - C[i + 8];
+    const int c = MUL(C[i + 4], kC2) - MUL(C[i + 12], kC1);
+    const int d = MUL(C[i + 4], kC1) + MUL(C[i + 12], kC2);
+    STORE(0, i, a + d);
+    STORE(1, i, b + c);
+    STORE(2, i, b - c);
+    STORE(3, i, a - d);
+  }
+}
+
+// Inverse-transforms one block, or two horizontally adjacent blocks when
+// 'do_two' is non-zero (the second one uses ref + 4, in + 16 and dst + 4).
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  ITransformOne(ref, in, dst);
+  if (!do_two) return;
+  ITransformOne(ref + 4, in + 16, dst + 4);
+}
+
+// Forward 4x4 transform of the difference between 'src' and 'ref' (both read
+// with stride BPS). The 16 resulting coefficients are written to 'out'.
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  int i;
+  int tmp[16];
+  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
+    const int d0 = src[0] - ref[0];
+    const int d1 = src[1] - ref[1];
+    const int d2 = src[2] - ref[2];
+    const int d3 = src[3] - ref[3];
+    // These sums can be negative: scale with a multiplication, because
+    // left-shifting a negative value is undefined behavior in C.
+    const int a0 = (d0 + d3) * 8;
+    const int a1 = (d1 + d2) * 8;
+    const int a2 = (d1 - d2) * 8;
+    const int a3 = (d0 - d3) * 8;
+    tmp[0 + i * 4] = (a0 + a1);
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
+    tmp[2 + i * 4] = (a0 - a1);
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  7500) >> 12;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);
+    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
+    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
+    const int a3 = (tmp[0 + i] - tmp[12 + i]);
+    out[0 + i] = (a0 + a1 + 7) >> 4;
+    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+    out[8 + i] = (a0 - a1 + 7) >> 4;
+    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
+  }
+}
+
+// Inverse Walsh-Hadamard transform of the 16 values in 'in'. The 16 results
+// are scattered into 'out' with a step of 16 (out[0], out[16], ... out[240]).
+static void ITransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  // first pass, one input column at a time
+  for (i = 0; i < 4; ++i) {
+    const int s0 = in[i + 0] + in[i + 12];
+    const int s1 = in[i + 4] + in[i + 8];
+    const int d1 = in[i + 4] - in[i + 8];
+    const int d0 = in[i + 0] - in[i + 12];
+    tmp[i + 0] = s0 + s1;
+    tmp[i + 4] = d0 + d1;
+    tmp[i + 8] = s0 - s1;
+    tmp[i + 12] = d0 - d1;
+  }
+  // second pass, one row of 'tmp' at a time; each row fills 64 output slots
+  for (i = 0; i < 4; ++i) {
+    const int* const row = tmp + 4 * i;
+    int16_t* const dst = out + 64 * i;
+    const int dc = row[0] + 3;    // w/ rounder
+    const int s0 = dc + row[3];
+    const int s1 = row[1] + row[2];
+    const int d1 = row[1] - row[2];
+    const int d0 = dc - row[3];
+    dst[0] = (s0 + s1) >> 3;
+    dst[16] = (d0 + d1) >> 3;
+    dst[32] = (s0 - s1) >> 3;
+    dst[48] = (d0 - d1) >> 3;
+  }
+}
+
+// Forward Walsh-Hadamard transform. The 16 inputs are gathered from 'in'
+// with a step of 16 (in[0], in[16], ... in[240]) and the 16 results are
+// written contiguously to 'out'.
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    // Inputs are signed: scale with a multiplication, because left-shifting
+    // a negative value is undefined behavior in C.
+    const int a0 = (in[0 * 16] + in[2 * 16]) * 4;
+    const int a1 = (in[1 * 16] + in[3 * 16]) * 4;
+    const int a2 = (in[1 * 16] - in[3 * 16]) * 4;
+    const int a3 = (in[0 * 16] - in[2 * 16]) * 4;
+    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a1 = (tmp[4 + i] + tmp[12+ i]);
+    const int a2 = (tmp[4 + i] - tmp[12+ i]);
+    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int b0 = a0 + a1;
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
+    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
+    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
+    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+  }
+}
+
+#undef MUL
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+
+// Sets every sample of the size x size block at 'dst' (stride BPS) to 'value'.
+static inline void Fill(uint8_t* dst, int value, int size) {
+  uint8_t* row = dst;
+  int remaining;
+  for (remaining = size; remaining > 0; --remaining, row += BPS) {
+    memset(row, value, size);
+  }
+}
+
+// Copies the 'size' samples at 'top' into each of the 'size' rows of 'dst'.
+// Without top samples (top == NULL) the block is filled with 127.
+static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) {
+  int row;
+  if (!top) {
+    Fill(dst, 127, size);
+    return;
+  }
+  for (row = 0; row < size; ++row) {
+    memcpy(dst + row * BPS, top, size);
+  }
+}
+
+// Fills row 'j' of the size x size block at 'dst' with left[j].
+// Without left samples (left == NULL) the block is filled with 129.
+static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) {
+  int row;
+  if (!left) {
+    Fill(dst, 129, size);
+    return;
+  }
+  for (row = 0; row < size; ++row) {
+    memset(dst + row * BPS, left[row], size);
+  }
+}
+
+// TrueMotion prediction: dst(x, y) = clip(top[x] + left[y] - left[-1]), with
+// fallbacks when one or both of 'left' / 'top' are NULL.
+static inline void TrueMotion(uint8_t* dst, const uint8_t* left,
+                              const uint8_t* top, int size) {
+  if (!left) {
+    // true motion without left samples (hence: with default 129 value)
+    // is equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is
+    // then 129, and not 127 as in the VerticalPred case.
+    if (top) {
+      VerticalPred(dst, top, size);
+    } else {
+      Fill(dst, 129, size);
+    }
+    return;
+  }
+  if (!top) {
+    HorizontalPred(dst, left, size);
+    return;
+  }
+  {
+    const int top_left = left[-1];
+    int x, y;
+    for (y = 0; y < size; ++y) {
+      // lut[t] is (t + left[y] - top_left) clipped to [0,255]
+      const uint8_t* const lut = clip1 + 255 + left[y] - top_left;
+      for (x = 0; x < size; ++x) {
+        dst[x + y * BPS] = lut[top[x]];
+      }
+    }
+  }
+}
+
+// DC prediction: fills the block with the rounded average of the available
+// 'top' and 'left' samples. When only one side is present its sum is counted
+// twice, so 'round' and 'shift' are those of the two-sided case. When neither
+// is present, 0x80 is used.
+static inline void DCMode(uint8_t* dst, const uint8_t* left,
+                          const uint8_t* top,
+                          int size, int round, int shift) {
+  int sum = 0;
+  int j;
+  if (!top && !left) {   // no top, no left, nothing.
+    Fill(dst, 0x80, size);
+    return;
+  }
+  if (top) {
+    for (j = 0; j < size; ++j) sum += top[j];
+  }
+  if (left) {
+    for (j = 0; j < size; ++j) sum += left[j];
+  }
+  if (!top || !left) {   // only one side available: double it
+    sum += sum;
+  }
+  Fill(dst, (sum + round) >> shift, size);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+// Computes the four 8x8 chroma predictions (DC, vertical, horizontal,
+// TrueMotion) for the U block and then for the V block. The V block lies
+// 8 samples further in 'dst' and 'top', and 16 samples further in 'left'.
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
+  int plane;   // 0: U block, 1: V block
+  for (plane = 0; plane < 2; ++plane) {
+    uint8_t* const out = dst + 8 * plane;
+    const uint8_t* const l = left ? left + 16 * plane : left;
+    const uint8_t* const t = top ? top + 8 * plane : top;
+    DCMode(C8DC8 + out, l, t, 8, 8, 4);
+    VerticalPred(C8VE8 + out, t, 8);
+    HorizontalPred(C8HE8 + out, l, 8);
+    TrueMotion(C8TM8 + out, l, t, 8);
+  }
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+// Computes the four 16x16 luma predictions (DC, vertical, horizontal and
+// TrueMotion), each written at its own I16xx16 offset from 'dst'.
+// 'left' and 'top' are forwarded as-is; the helpers handle NULL pointers.
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
+  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
+  VerticalPred(I16VE16 + dst, top, 16);
+  HorizontalPred(I16HE16 + dst, left, 16);
+  TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// Vertical 4x4 prediction: every row is the smoothed top row, where sample x
+// is AVG3(top[x - 1], top[x], top[x + 1]).
+static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  uint8_t row[4];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    row[i] = AVG3(top[i - 1], top[i], top[i + 1]);
+  }
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, row, 4);
+  }
+}
+
+// Horizontal 4x4 prediction: each row is filled with one smoothed left
+// sample. X is the top-left sample (top[-1]) and I, J, K, L are the left
+// samples top[-2] .. top[-5].
+static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  // memset() writes the same four bytes as a 0x01010101-replicated 32-bit
+  // store, without accessing 'dst' through a uint32_t pointer (which breaks
+  // strict aliasing and may be misaligned).
+  memset(dst + 0 * BPS, AVG3(X, I, J), 4);
+  memset(dst + 1 * BPS, AVG3(I, J, K), 4);
+  memset(dst + 2 * BPS, AVG3(J, K, L), 4);
+  memset(dst + 3 * BPS, AVG3(K, L, L), 4);
+}
+
+// DC 4x4 prediction: fills the block with the rounded average of the four
+// top samples top[0..3] and the four left samples top[-5..-2].
+static void DC4(uint8_t* dst, const uint8_t* top) {
+  int sum = 4;   // rounder for the '>> 3' below
+  int i;
+  for (i = 0; i < 4; ++i) sum += top[i];
+  for (i = -5; i < -1; ++i) sum += top[i];
+  Fill(dst, sum >> 3, 4);
+}
+
+// 4x4 prediction that is constant along each down-right diagonal.
+// The edge samples L, K, J, I, X, A, B, C, D are contiguous in memory
+// (top[-5] .. top[3]), so the sample at (x, y) is the 3-tap average centered
+// on top[x - y - 1].
+static void RD4(uint8_t* dst, const uint8_t* top) {
+  int x, y;
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const uint8_t* const e = top + (x - y);
+      DST(x, y) = AVG3(e[-2], e[-1], e[0]);
+    }
+  }
+}
+
+// 4x4 prediction that is constant along each down-left diagonal.
+// With s = x + y, the sample at (x, y) is AVG3(top[s], top[s + 1],
+// top[s + 2]); on the last diagonal (s == 6) top[7] is used twice.
+static void LD4(uint8_t* dst, const uint8_t* top) {
+  int x, y;
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int s = x + y;
+      const int last = (s + 2 > 7) ? 7 : s + 2;
+      DST(x, y) = AVG3(top[s], top[s + 1], top[last]);
+    }
+  }
+}
+
+// 4x4 directional prediction built from the top-left sample X (top[-1]),
+// the left samples I, J, K (top[-2..-4]) and the top samples A..D (top[0..3]).
+// DST(x, y) addresses column x, row y of the destination block.
+// (Presumably the "vertical-right" mode, going by the name -- confirm.)
+static void VR4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  // 2-tap averages: row 0, repeated one column to the right on row 2.
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  // 3-tap averages: row 1 (repeated on row 3) and the rest of column 0.
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+// 4x4 directional prediction built only from the top and top-right samples
+// A..H (top[0..7]). DST(x, y) addresses column x, row y of the destination.
+// (Presumably the "vertical-left" mode, going by the name -- confirm.)
+static void VL4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  // 2-tap averages: row 0, repeated one column to the left on row 2.
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+  // 3-tap averages: row 1 (repeated on row 3) plus the last column of
+  // rows 2 and 3.
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+// 4x4 directional prediction built only from the left samples I, J, K, L
+// (top[-2] .. top[-5]). DST(x, y) addresses column x, row y of the
+// destination. The bottom row and the end of row 2 are set to L.
+// (Presumably the "horizontal-up" mode, going by the name -- confirm.)
+static void HU4(uint8_t* dst, const uint8_t* top) {
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+// 4x4 directional prediction built from the top-left sample X (top[-1]),
+// the left samples I..L (top[-2..-5]) and the top samples A..C (top[0..2]).
+// DST(x, y) addresses column x, row y of the destination block.
+// (Presumably the "horizontal-down" mode, going by the name -- confirm.)
+static void HD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+
+  // 2-tap averages: column 0, repeated one row lower in column 2.
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  // 3-tap averages: column 1 (repeated in column 3) and the rest of row 0.
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+// TrueMotion 4x4 prediction: dst(x, y) = clip(top[x] + left_y - top[-1]),
+// where left_y is top[-2 - y]. Clipping goes through the clip1[] table.
+static void TM4(uint8_t* dst, const uint8_t* top) {
+  const int top_left = top[-1];
+  int x, y;
+  for (y = 0; y < 4; ++y) {
+    const int left = top[-2 - y];
+    for (x = 0; x < 4; ++x) {
+      dst[x + y * BPS] = clip1[255 + top[x] + left - top_left];
+    }
+  }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+// Computes all ten 4x4 luma predictions from the same edge samples, each
+// written at its own I4xx4 offset from 'dst'.
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+// Returns the sum of squared differences between the w x h blocks at 'a'
+// and 'b', both read with stride BPS.
+static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
+  int sse = 0;
+  int row;
+  for (row = 0; row < h; ++row) {
+    const uint8_t* const pa = a + row * BPS;
+    const uint8_t* const pb = b + row * BPS;
+    int col;
+    for (col = 0; col < w; ++col) {
+      const int d = (int)pa[col] - pb[col];
+      sse += d * d;
+    }
+  }
+  return sse;
+}
+
+// Fixed-size wrappers around GetSSE(); the name gives width x height.
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 16);
+}
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 8);
+}
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 8, 8);
+}
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 4, 4);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// Applies a 4x4 Hadamard transform to the block at 'in' (stride BPS) and
+// returns the sum of the absolute transformed coefficients, each multiplied
+// by the matching entry of the 16 weights in 'w'.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum = 0;
+  int tmp[16];
+  int i;
+  // horizontal pass
+  for (i = 0; i < 4; ++i, in += BPS) {
+    // The differences can be negative: scale with a multiplication, because
+    // left-shifting a negative value is undefined behavior in C.
+    const int a0 = (in[0] + in[2]) * 4;
+    const int a1 = (in[1] + in[3]) * 4;
+    const int a2 = (in[1] - in[3]) * 4;
+    const int a3 = (in[0] - in[2]) * 4;
+    tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  // vertical pass
+  for (i = 0; i < 4; ++i, ++w) {
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a1 = (tmp[4 + i] + tmp[12+ i]);
+    const int a2 = (tmp[4 + i] - tmp[12+ i]);
+    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int b0 = a0 + a1;
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    sum += w[ 0] * ((abs(b0) + 3) >> 3);
+    sum += w[ 4] * ((abs(b1) + 3) >> 3);
+    sum += w[ 8] * ((abs(b2) + 3) >> 3);
+    sum += w[12] * ((abs(b3) + 3) >> 3);
+  }
+  return sum;
+}
+
+// Texture distortion between two 4x4 blocks: the absolute difference of
+// their weighted transform sums, rounded and divided by 16.
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int energy_a = TTransform(a, w);
+  const int energy_b = TTransform(b, w);
+  const int delta = abs(energy_b - energy_a);
+  return (delta + 8) >> 4;
+}
+
+// Texture distortion of a 16x16 block: the sum of Disto4x4() over its
+// sixteen 4x4 sub-blocks, visited row by row.
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int total = 0;
+  int bx, by;
+  for (by = 0; by < 4; ++by) {
+    for (bx = 0; bx < 4; ++bx) {
+      const int off = 4 * bx + 4 * by * BPS;
+      total += Disto4x4(a + off, b + off, w);
+    }
+  }
+  return total;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Maps scan position n to the index of the coefficient within the 4x4 block
+// (QuantizeBlock() reads in[kZigzag[n]] and writes out[n]).
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+// Simple quantization
+// Quantizes the coefficients of in[] from scan position 'n' up to 15.
+// out[] receives the levels in scan order and in[] is overwritten with the
+// dequantized values (level * q_). Returns 1 if a non-zero level was produced.
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         int n, const VP8Matrix* const mtx) {
+  int last = -1;
+  while (n < 16) {
+    const int j = kZigzag[n];
+    const int negative = (in[j] < 0);
+    int magnitude = (negative ? -in[j] : in[j]) + mtx->sharpen_[j];
+    if (magnitude > 2047) magnitude = 2047;
+    if (magnitude <= mtx->zthresh_[j]) {
+      // Below the zeroing threshold: drop the coefficient.
+      out[n] = 0;
+      in[j] = 0;
+    } else {
+      const int Q = mtx->q_[j];
+      const int iQ = mtx->iq_[j];
+      const int B = mtx->bias_[j];
+      int16_t level = QUANTDIV(magnitude, iQ, B);
+      if (negative) level = -level;
+      out[n] = level;
+      in[j] = level * Q;
+      if (level) last = n;
+    }
+    ++n;
+  }
+  return (last >= 0);
+}
+
+//------------------------------------------------------------------------------
+// Block copy
+
+// Copies a size x size block from 'src' to 'dst'; both use stride BPS.
+static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
+  int row;
+  for (row = 0; row < size; ++row) {
+    memcpy(dst + row * BPS, src + row * BPS, size);
+  }
+}
+
+// Fixed-size wrappers around Copy() for square blocks of 4, 8 and 16 samples.
+static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
+static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }
+
+//------------------------------------------------------------------------------
+// Initialization
+
+// Speed-critical function pointers. They hold no usable value until
+// VP8EncDspInit() has installed the default implementations.
+
+// Histogram
+VP8CHisto VP8CollectHistogram;
+// Transforms
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8WHT VP8ITransformWHT;
+VP8WHT VP8FTransformWHT;
+// Intra predictions
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+// Metrics
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+// Quantization
+VP8QuantizeBlock VP8EncQuantizeBlock;
+// Block copy
+VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy8x8;
+VP8BlockCopy VP8Copy16x16;
+
+extern void VP8EncDspInitSSE2(void);
+
+// Fills the clipping table, installs the plain-C implementations in every
+// hook above, then lets the SSE2 initializer override some of them when the
+// build allows it and the CPU-detection hook reports SSE2 support.
+void VP8EncDspInit(void) {
+  InitTables();
+
+  // default C implementations
+  VP8CollectHistogram = CollectHistogram;
+
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8ITransformWHT = ITransformWHT;
+  VP8FTransformWHT = FTransformWHT;
+
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+
+  VP8EncQuantizeBlock = QuantizeBlock;
+
+  VP8Copy4x4 = Copy4x4;
+  VP8Copy8x8 = Copy8x8;
+  VP8Copy16x16 = Copy16x16;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+#if defined(__SSE2__) || defined(_MSC_VER)
+  if (VP8GetCPUInfo && VP8GetCPUInfo(kSSE2)) {
+    VP8EncDspInitSSE2();
+  }
+#endif
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/enc_sse2.c
+++ b/src/dsp/enc_sse2.c
@@ -0,0 +1,834 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical encoding functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#if defined(__SSE2__) || defined(_MSC_VER)
+#include <emmintrin.h>
+
+#include "../enc/vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  for (j = start_block; j < end_block; ++j) {
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign0 = _mm_srai_epi16(out0, 15);
+      const __m128i sign1 = _mm_srai_epi16(out1, 15);
+      // abs(out) = (out ^ sign) - sign
+      const __m128i xor0 = _mm_xor_si128(out0, sign0);
+      const __m128i xor1 = _mm_xor_si128(out1, sign1);
+      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
+      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
+      // v = abs(out) >> 2
+      const __m128i v0 = _mm_srai_epi16(abs0, 2);
+      const __m128i v1 = _mm_srai_epi16(abs1, 2);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Use bin to update histogram.
+    for (k = 0; k < 16; ++k) {
+      histo[out[k]]++;
+    }
+  }
+
+  return VP8GetAlpha(histo);
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85627  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant becomes the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two inverse
+  // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors is zero-filled by _mm_loadl_epi64() and the
+  // values computed from it are never used nor stored.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'ref' and store.
+  {
+    const __m128i zero = _mm_set1_epi16(0);
+    // Load the reference(s).
+    __m128i ref0, ref1, ref2, ref3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
+      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
+      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
+      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
+    }
+    // Convert to 16b.
+    ref0 = _mm_unpacklo_epi8(ref0, zero);
+    ref1 = _mm_unpacklo_epi8(ref1, zero);
+    ref2 = _mm_unpacklo_epi8(ref2, zero);
+    ref3 = _mm_unpacklo_epi8(ref3, zero);
+    // Add the inverse transform(s).
+    ref0 = _mm_add_epi16(ref0, T0);
+    ref1 = _mm_add_epi16(ref1, T1);
+    ref2 = _mm_add_epi16(ref2, T2);
+    ref3 = _mm_add_epi16(ref3, T3);
+    // Unsigned saturate to 8b.
+    ref0 = _mm_packus_epi16(ref0, ref0);
+    ref1 = _mm_packus_epi16(ref1, ref1);
+    ref2 = _mm_packus_epi16(ref2, ref2);
+    ref3 = _mm_packus_epi16(ref3, ref3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
+    }
+  }
+}
+
+static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i seven = _mm_set1_epi16(7);
+  const __m128i k7500 = _mm_set1_epi32(7500);
+  const __m128i k14500 = _mm_set1_epi32(14500);
+  const __m128i k51000 = _mm_set1_epi32(51000);
+  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
+                                           5352,  2217, 5352,  2217);
+  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+                                           2217, -5352, 2217, -5352);
+
+  __m128i v01, v32;
+
+  // Difference between src and ref and initial transpose.
+  {
+    // Load src and convert to 16b.
+    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
+    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
+    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
+    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
+    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+    // Load ref and convert to 16b.
+    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+    // Compute difference.
+    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+    // Transpose.
+    // 00 01 02 03   0 0 0 0
+    // 10 11 12 13   0 0 0 0
+    // 20 21 22 23   0 0 0 0
+    // 30 31 32 33   0 0 0 0
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // a02 a12 a22 a32   a03 a13 a23 a33
+    // a00 a10 a20 a30   a01 a11 a21 a31
+    // a03 a13 a23 a33   a02 a12 a22 a32
+  }
+
+  // First pass and subsequent transpose.
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // b0 = (a0 + a3) << 3
+    // b1 = (a1 + a2) << 3
+    // b3 = (a0 - a3) << 3
+    // b2 = (a1 - a2) << 3
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i b01 = _mm_slli_epi16(a01, 3);
+    const __m128i b32 = _mm_slli_epi16(a32, 3);
+    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
+    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
+
+    // e0 = b0 + b1
+    // e2 = b0 - b1
+    const __m128i e0 = _mm_add_epi16(b01, b11);
+    const __m128i e2 = _mm_sub_epi16(b01, b11);
+    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
+
+    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
+    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
+    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k14500);
+    const __m128i d3 = _mm_add_epi32(c3, k7500);
+    const __m128i e1 = _mm_srai_epi32(d1, 12);
+    const __m128i e3 = _mm_srai_epi32(d3, 12);
+    const __m128i e13 = _mm_packs_epi32(e1, e3);
+
+    // Transpose.
+    // 00 01 02 03  20 21 22 23
+    // 10 11 12 13  30 31 32 33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
+    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // 02 12 22 32   03 13 23 33
+    // 00 10 20 30   01 11 21 31
+    // 03 13 23 33   02 12 22 32
+  }
+
+  // Second pass
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // a0 = v0 + v3
+    // a1 = v1 + v2
+    // a3 = v0 - v3
+    // a2 = v1 - v2
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+
+    // d0 = (a0 + a1 + 7) >> 4;
+    // d2 = (a0 - a1 + 7) >> 4;
+    const __m128i b0 = _mm_add_epi16(a01, a11);
+    const __m128i b2 = _mm_sub_epi16(a01, a11);
+    const __m128i c0 = _mm_add_epi16(b0, seven);
+    const __m128i c2 = _mm_add_epi16(b2, seven);
+    const __m128i d0 = _mm_srai_epi16(c0, 4);
+    const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+    // f1 = ((a3 * 5352 + a2 * 2217 + 12000) >> 16)
+    // f3 = ((a3 * 2217 - a2 * 5352 + 51000) >> 16)
+    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+    const __m128i d3 = _mm_add_epi32(c3, k51000);
+    const __m128i e1 = _mm_srai_epi32(d1, 16);
+    const __m128i e3 = _mm_srai_epi32(d3, 16);
+    const __m128i f1 = _mm_packs_epi32(e1, e1);
+    const __m128i f3 = _mm_packs_epi32(e3, e3);
+    // f1 = f1 + (a3 != 0);
+    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+    // desired (0, 1), we add one earlier through k12000_plus_one.
+    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+    _mm_storel_epi64((__m128i*)&out[ 0], d0);
+    _mm_storel_epi64((__m128i*)&out[ 4], g1);
+    _mm_storel_epi64((__m128i*)&out[ 8], d2);
+    _mm_storel_epi64((__m128i*)&out[12], f3);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_set1_epi16(0);
+
+  // Load values.
+  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
+  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
+  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
+  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
+  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
+  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
+  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
+  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
+
+  // Combine pair of lines and convert to 16b.
+  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+
+  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
+  // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
+  //                  need absolute values, there is no need to do calculation
+  //                  in 8bit as we are already in 16bit, ... Yet this is what
+  //                  benchmarks the fastest!
+  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
+  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
+  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
+  const __m128i d3 = _mm_subs_epu8(b23s, a23s);
+
+  // Square and add them all together.
+  const __m128i madd0 = _mm_madd_epi16(d0, d0);
+  const __m128i madd1 = _mm_madd_epi16(d1, d1);
+  const __m128i madd2 = _mm_madd_epi16(d2, d2);
+  const __m128i madd3 = _mm_madd_epi16(d3, d3);
+  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
+  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
+  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+  int32_t tmp[4];
+  _mm_storeu_si128((__m128i*)tmp, sum2);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sum of the absolute value of
+// transformed coefficients.
+static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
+                          const uint16_t* const w) {
+  int32_t sum[4];
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i three = _mm_set1_epi16(3);
+
+  // Load, combine and transpose inputs.
+  {
+    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
+    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
+    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
+    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
+    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
+    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0
+
+    // Transpose the two 4x4, discarding the filling zeroes.
+    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
+    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
+    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
+    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
+    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33
+
+    // Convert to 16b.
+    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
+    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
+    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
+    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
+    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
+    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
+    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
+    // b0_extra = (a0 != 0);
+    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
+    const __m128i b0_base = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Vertical pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    // TODO(cduvivier): Make variable declarations and allocations aligned so
+    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    {
+      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
+      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
+      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
+      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
+
+      // b = abs(b) = (b ^ sign) - sign
+      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
+      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
+      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
+      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
+      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
+      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
+      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
+      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
+    }
+
+    // b = abs(b) + 3
+    A_b0 = _mm_add_epi16(A_b0, three);
+    A_b2 = _mm_add_epi16(A_b2, three);
+    B_b0 = _mm_add_epi16(B_b0, three);
+    B_b2 = _mm_add_epi16(B_b2, three);
+
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    // b = (abs(b) + 3) >> 3
+    A_b0 = _mm_srai_epi16(A_b0, 3);
+    A_b2 = _mm_srai_epi16(A_b2, 3);
+    B_b0 = _mm_srai_epi16(B_b0, 3);
+    B_b2 = _mm_srai_epi16(B_b2, 3);
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b0 = _mm_sub_epi32(A_b0, B_b0);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int diff_sum = TTransformSSE2(a, b, w);
+  return (abs(diff_sum) + 8) >> 4;
+}
+
+static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4SSE2(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Simple quantization
+static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
+                             int n, const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i sign0, sign8;
+  __m128i coeff0, coeff8;
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  // TODO(cduvivier): Make variable declarations and allocations aligned so that
+  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
+  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
+  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
+  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
+  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
+  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
+  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
+
+  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
+  sign0 = _mm_srai_epi16(in0, 15);
+  sign8 = _mm_srai_epi16(in8, 15);
+
+  // coeff = abs(in) = (in ^ sign) - sign
+  coeff0 = _mm_xor_si128(in0, sign0);
+  coeff8 = _mm_xor_si128(in8, sign8);
+  coeff0 = _mm_sub_epi16(coeff0, sign0);
+  coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+  // coeff = abs(in) + sharpen
+  coeff0 = _mm_add_epi16(coeff0, sharpen0);
+  coeff8 = _mm_add_epi16(coeff8, sharpen8);
+
+  // if (coeff > 2047) coeff = 2047
+  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
+  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
+
+  // out = (coeff * iQ + B) >> QFIX;
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // expand bias from 16b to 32b
+    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
+    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
+    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
+    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
+    // out = (coeff * iQ + B)
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = (coeff * iQ + B) >> QFIX;
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+  }
+
+  // get sign back (if (sign[j]) out_n = -out_n)
+  out0 = _mm_xor_si128(out0, sign0);
+  out8 = _mm_xor_si128(out8, sign8);
+  out0 = _mm_sub_epi16(out0, sign0);
+  out8 = _mm_sub_epi16(out8, sign8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
+  {
+    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
+    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
+    in0 = _mm_and_si128(in0, cmp0);
+    in8 = _mm_and_si128(in8, cmp8);
+    _mm_storeu_si128((__m128i*)&in[0], in0);
+    _mm_storeu_si128((__m128i*)&in[8], in8);
+    out0 = _mm_and_si128(out0, cmp0);
+    out8 = _mm_and_si128(out8, cmp8);
+  }
+
+  // zigzag the output before storing it.
+  //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. After it, we only need to swap the 7th (ending up in third
+  // position instead of twelfth) and 8th values.
+  {
+    __m128i outZ0, outZ8;
+    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
+    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
+    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+    _mm_storeu_si128((__m128i*)&out[0], outZ0);
+    _mm_storeu_si128((__m128i*)&out[8], outZ8);
+    packed_out = _mm_packs_epi16(outZ0, outZ8);
+  }
+  {
+    const int16_t outZ_12 = out[12];
+    const int16_t outZ_3 = out[3];
+    out[3] = outZ_12;
+    out[12] = outZ_3;
+  }
+
+  // detect if all 'out' values are zeroes or not
+  {
+    int32_t tmp[4];
+    _mm_storeu_si128((__m128i*)tmp, packed_out);
+    if (n) {
+      tmp[0] &= ~0xff;
+    }
+    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
+  }
+}
+
+extern void VP8EncDspInitSSE2(void);
+void VP8EncDspInitSSE2(void) {
+  VP8CollectHistogram = CollectHistogramSSE2;
+  VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8ITransform = ITransformSSE2;
+  VP8FTransform = FTransformSSE2;
+  VP8SSE4x4 = SSE4x4SSE2;
+  VP8TDisto4x4 = Disto4x4SSE2;
+  VP8TDisto16x16 = Disto16x16SSE2;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // __SSE2__ || _MSC_VER
--- a/src/dsp/upsampling.c
+++ b/src/dsp/upsampling.c
@@ -0,0 +1,226 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+#include "./yuv.h"
+#include "../dec/webpi.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB
+WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
+WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST];
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
+
+// u and v are processed together, packed in 32 bits: u low 16, v high 16.
+#define LOAD_UV(u,v) ((u) | ((v) << 16))
+// Defines FUNC_NAME(); a NULL top_y/bottom_y skips that row. XSTEP: dst step.
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  if (top_y) {                                                                 \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {  /* even len: last pixel has no right-hand sample */      \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// One upsampler per output format; last argument is the dst byte step.
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+// These four don't erase the alpha value already in the destination
+UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4)
+UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4)
+UPSAMPLE_FUNC(UpsampleArgbKeepAlphaLinePair, VP8YuvToArgbKeepA, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444KeepAlphaLinePair, VP8YuvToRgba4444KeepA, 2)
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// Simple point-sampling: each u/v sample is reused for a 2x2 pixel block.
+// Note: top_y and bottom_y are both dereferenced unconditionally (no NULL test).
+#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int i;                                                                       \
+  for (i = 0; i < len - 1; i += 2) {                                           \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(top_y[1], u[0], v[0], top_dst + XSTEP);                               \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+    FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP);                         \
+    top_y += 2;                                                                \
+    bottom_y += 2;                                                             \
+    u++;                                                                       \
+    v++;                                                                       \
+    top_dst += 2 * XSTEP;                                                      \
+    bottom_dst += 2 * XSTEP;                                                   \
+  }                                                                            \
+  if (i == len - 1) {    /* last one */                                        \
+    FUNC(top_y[0], u[0], v[0], top_dst);                                       \
+    FUNC(bottom_y[0], u[0], v[0], bottom_dst);                                 \
+  }                                                                            \
+}
+
+// One point-sampler per output format; last argument is the dst byte step.
+SAMPLE_FUNC(SampleRgbLinePair,      VP8YuvToRgb,  3)
+SAMPLE_FUNC(SampleBgrLinePair,      VP8YuvToBgr,  3)
+SAMPLE_FUNC(SampleRgbaLinePair,     VP8YuvToRgba, 4)
+SAMPLE_FUNC(SampleBgraLinePair,     VP8YuvToBgra, 4)
+SAMPLE_FUNC(SampleArgbLinePair,     VP8YuvToArgb, 4)
+SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+SAMPLE_FUNC(SampleRgb565LinePair,   VP8YuvToRgb565, 2)
+
+#undef SAMPLE_FUNC
+// Indexed by output mode; each entry is annotated with the mode it serves.
+const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = {
+  SampleRgbLinePair,       // MODE_RGB
+  SampleRgbaLinePair,      // MODE_RGBA
+  SampleBgrLinePair,       // MODE_BGR
+  SampleBgraLinePair,      // MODE_BGRA
+  SampleArgbLinePair,      // MODE_ARGB
+  SampleRgba4444LinePair,  // MODE_RGBA_4444
+  SampleRgb565LinePair     // MODE_RGB_565
+};
+
+//------------------------------------------------------------------------------
+// YUV444 converter: one u/v sample per luma sample, so nothing is upsampled;
+// pixel i is converted by FUNC and written at dst + i * XSTEP.
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+// One converter per output format; last argument is the dst byte step.
+YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
+YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
+YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
+YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
+
+#undef YUV444_FUNC
+// Indexed by output mode; each entry is annotated with the mode it serves.
+const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
+  Yuv444ToRgb,       // MODE_RGB
+  Yuv444ToRgba,      // MODE_RGBA
+  Yuv444ToBgr,       // MODE_BGR
+  Yuv444ToBgra,      // MODE_BGRA
+  Yuv444ToArgb,      // MODE_ARGB
+  Yuv444ToRgba4444,  // MODE_RGBA_4444
+  Yuv444ToRgb565     // MODE_RGB_565
+};
+
+//------------------------------------------------------------------------------
+// Main call: fills both upsampler tables with the plain-C versions, then lets
+// the SSE2 init override some entries. Does nothing without FANCY_UPSAMPLING.
+void WebPInitUpsamplers(void) {
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  // KeepAlpha table: RGB, BGR and RGB_565 reuse the regular upsamplers.
+  WebPUpsamplersKeepAlpha[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplersKeepAlpha[MODE_RGBA]      = UpsampleRgbKeepAlphaLinePair;
+  WebPUpsamplersKeepAlpha[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplersKeepAlpha[MODE_BGRA]      = UpsampleBgrKeepAlphaLinePair;
+  WebPUpsamplersKeepAlpha[MODE_ARGB]      = UpsampleArgbKeepAlphaLinePair;
+  WebPUpsamplersKeepAlpha[MODE_RGBA_4444] = UpsampleRgba4444KeepAlphaLinePair;
+  WebPUpsamplersKeepAlpha[MODE_RGB_565]   = UpsampleRgb565LinePair;
+
+  // If VP8GetCPUInfo is set, use it to switch some pointers to faster versions.
+  if (VP8GetCPUInfo) {
+#if defined(__SSE2__) || defined(_MSC_VER)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitUpsamplersSSE2();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/dsp/upsampling_sse2.c
+++ b/src/dsp/upsampling_sse2.c
@@ -0,0 +1,215 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#if defined(__SSE2__) || defined(_MSC_VER)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+#include "./dsp.h"
+#include "./yuv.h"
+#include "../dec/webpi.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef FANCY_UPSAMPLING
+
+// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+// u = (9*a + 3*b + 3*c + d + 8) / 16
+//   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+//   = (a + m + 1) / 2
+// where m = (a + 3*b + 3*c + d) / 8
+//         = ((a + b + c + d) / 2 + b + c) / 4
+//
+// Let's say  k = (a + b + c + d) / 4.
+// We can compute k as
+// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+//
+// Then m can be written as
+// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+
+// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
+#define GET_M(ij, in, out) do {                                                \
+  const __m128i tmp0 = _mm_avg_epu8(k, (in));     /* (k + in + 1) / 2 */       \
+  const __m128i tmp1 = _mm_and_si128((ij), st);   /* (ij) & (s^t) */           \
+  const __m128i tmp2 = _mm_xor_si128(k, (in));    /* (k^in) */                 \
+  const __m128i tmp3 = _mm_or_si128(tmp1, tmp2);  /* ((ij) & (s^t)) | (k^in) */\
+  const __m128i tmp4 = _mm_and_si128(tmp3, one);  /* & 1 -> lsb_correction */  \
+  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
+} while (0)
+
+// pack and store two alternating pixel rows; 'out' must be 16-byte aligned
+#define PACK_AND_STORE(a, b, da, db, out) do {                                 \
+  const __m128i ta = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */  \
+  const __m128i tb = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */  \
+  const __m128i t1 = _mm_unpacklo_epi8(ta, tb);                                \
+  const __m128i t2 = _mm_unpackhi_epi8(ta, tb);                                \
+  _mm_store_si128(((__m128i*)(out)) + 0, t1);                                  \
+  _mm_store_si128(((__m128i*)(out)) + 1, t2);                                  \
+} while (0)
+// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels per row:
+// the r1-side row is stored at out[0..31], the r2-side row at out[64..95].
+#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+  const __m128i one = _mm_set1_epi8(1);                                        \
+  const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]);                       \
+  const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]);                       \
+  const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]);                       \
+  const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]);                       \
+                                                                               \
+  const __m128i s = _mm_avg_epu8(a, d);        /* s = (a + d + 1) / 2 */       \
+  const __m128i t = _mm_avg_epu8(b, c);        /* t = (b + c + 1) / 2 */       \
+  const __m128i st = _mm_xor_si128(s, t);      /* st = s^t */                  \
+                                                                               \
+  const __m128i ad = _mm_xor_si128(a, d);      /* ad = a^d */                  \
+  const __m128i bc = _mm_xor_si128(b, c);      /* bc = b^c */                  \
+                                                                               \
+  const __m128i t1 = _mm_or_si128(ad, bc);     /* (a^d) | (b^c) */             \
+  const __m128i t2 = _mm_or_si128(t1, st);     /* (a^d) | (b^c) | (s^t) */     \
+  const __m128i t3 = _mm_and_si128(t2, one);   /* (a^d) | (b^c) | (s^t) & 1 */ \
+  const __m128i t4 = _mm_avg_epu8(s, t);                                       \
+  const __m128i k = _mm_sub_epi8(t4, t3);      /* k = (a + b + c + d) / 4 */   \
+  __m128i diag1, diag2;                                                        \
+                                                                               \
+  GET_M(bc, t, diag1);                  /* diag1 = (a + 3b + 3c + d) / 8 */    \
+  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
+                                                                               \
+  /* pack the alternate pixels */                                              \
+  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
+  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
+}
+
+// Function form of UPSAMPLE_32PIXELS, to reduce code size on non-critical paths
+static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
+                             uint8_t* const out) {
+  UPSAMPLE_32PIXELS(r1, r2, out);
+}
+// Tail block: copies num_pixels samples (1..17 fit) and repeats the last to 17.
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                         \
+  uint8_t r1[17], r2[17];                                                      \
+  memcpy(r1, (tb), (num_pixels));                                              \
+  memcpy(r2, (bb), (num_pixels));                                              \
+  /* replicate last byte */                                                    \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
+  /* using the shared function instead of the macro saves ~3k code size */     \
+  Upsample32Pixels(r1, r2, out);                                               \
+}
+// 'uv' layout: top u at [0], top v at [32], bottom u at [64], bottom v at [96].
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
+                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
+  int n;                                                                       \
+  if (top_y) {                                                                 \
+    for (n = 0; n < (num_pixels); ++n) {                                       \
+      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
+           top_dst + ((cur_x) + n) * XSTEP);                                   \
+    }                                                                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    for (n = 0; n < (num_pixels); ++n) {                                       \
+      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
+           bottom_dst + ((cur_x) + n) * XSTEP);                                \
+    }                                                                          \
+  }                                                                            \
+}
+// Defines FUNC_NAME(): 32 pixels per full block, then one padded tail block.
+#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int b;                                                                       \
+  /* 16 byte aligned array to cache reconstructed u and v */                   \
+  uint8_t uv_buf[4 * 32 + 15];                                                 \
+  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
+  const int uv_len = (len + 1) >> 1;                                           \
+  /* 17 pixels must be read-able for each block */                             \
+  const int num_blocks = (uv_len - 1) >> 4;                                    \
+  const int leftover = uv_len - num_blocks * 16;                               \
+  const int last_pos = 1 + 32 * num_blocks;                                    \
+                                                                               \
+  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                         \
+  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                         \
+                                                                               \
+  assert(len > 0);                                                             \
+  /* Treat the first pixel in regular way */                                   \
+  if (top_y) {                                                                 \
+    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
+    FUNC(top_y[0], u0, v0, top_dst);                                           \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
+    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
+    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
+  }                                                                            \
+                                                                               \
+  for (b = 0; b < num_blocks; ++b) {                                           \
+    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
+    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
+                32 * b + 1, 32)                                                \
+    top_u += 16;                                                               \
+    cur_u += 16;                                                               \
+    top_v += 16;                                                               \
+    cur_v += 16;                                                               \
+  }                                                                            \
+                                                                               \
+  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
+  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
+  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
+              last_pos, len - last_pos);                                       \
+}
+
+// SSE2 fancy-upsampler variants (last argument: dst byte step per pixel).
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2,  VP8YuvToRgb,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2,  VP8YuvToBgr,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
+// These two write 3 bytes with a 4-byte step, so they don't erase the alpha value
+SSE2_UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePairSSE2, VP8YuvToRgb, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4)
+
+#undef GET_M
+#undef PACK_AND_STORE
+#undef UPSAMPLE_32PIXELS
+#undef UPSAMPLE_LAST_BLOCK
+#undef CONVERT2RGB
+#undef SSE2_UPSAMPLE_FUNC
+
+//------------------------------------------------------------------------------
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */];
+
+#endif  // FANCY_UPSAMPLING
+// Replaces the RGB, RGBA, BGR and BGRA entries of both tables with SSE2 code.
+void WebPInitUpsamplersSSE2(void) {
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;
+
+  WebPUpsamplersKeepAlpha[MODE_RGB]  = UpsampleRgbLinePairSSE2;
+  WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePairSSE2;
+  WebPUpsamplersKeepAlpha[MODE_BGR]  = UpsampleBgrLinePairSSE2;
+  WebPUpsamplersKeepAlpha[MODE_BGRA] = UpsampleBgrKeepAlphaLinePairSSE2;
+#endif  // FANCY_UPSAMPLING
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   //__SSE2__ || _MSC_VER
--- a/src/dsp/yuv.c
+++ b/src/dsp/yuv.c
@@ -9,7 +9,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "yuv.h"
+#include "./yuv.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -20,9 +20,14 @@ enum { YUV_HALF = 1 << (YUV_FIX - 1) };
 int16_t VP8kVToR[256], VP8kUToB[256];
 int32_t VP8kVToG[256], VP8kUToG[256];
 uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];

 static int done = 0;

+static inline uint8_t clip(int v, int max_value) {  // clamp v to [0, max_value]
+  return v < 0 ? 0 : v > max_value ? max_value : v;
+}
+
 void VP8YUVInit(void) {
  int i;
  if (done) {
@@ -36,7 +41,8 @@ void VP8YUVInit(void) {
  }
  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
-    VP8kClip[i - YUV_RANGE_MIN] = (k < 0) ? 0 : (k > 255) ? 255 : k;
+    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
  }
  done = 1;
 }
--- a/src/dsp/yuv.h
+++ b/src/dsp/yuv.h
@@ -0,0 +1,109 @@
+// Copyright 2010 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// inline YUV->RGB conversion function
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_YUV_H_
+#define WEBP_DSP_YUV_H_
+
+#include "../webp/decode_vp8.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+enum { YUV_FIX = 16,                // fixed-point precision
+       YUV_RANGE_MIN = -227,        // min value of r/g/b output
+       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+};
+extern int16_t VP8kVToR[256], VP8kUToB[256];
+extern int32_t VP8kVToG[256], VP8kUToG[256];
+extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+                               uint8_t* const rgb) {  // writes rgb[0..2] = R, G, B
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];  // table index is biased by MIN
+  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+}
+
+static inline void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+                                  uint8_t* const rgb) {  // writes rgb[0..1]
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |  // R: top 5 bits
+            (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));    // G: top 3 bits
+  rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |  // G: next 3
+            (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));    // B: top 5 bits
+}
+
+static inline void VP8YuvToArgbKeepA(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  // Don't update Alpha (argb[0]); R, G, B are written to argb[1..3]
+  VP8YuvToRgb(y, u, v, argb + 1);
+}
+
+static inline void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                uint8_t* const argb) {
+  argb[0] = 0xff;  // alpha byte forced to its maximum value
+  VP8YuvToArgbKeepA(y, u, v, argb);  // fills argb[1..3]
+}
+
+static inline void VP8YuvToRgba4444KeepA(uint8_t y, uint8_t u, uint8_t v,
+                                         uint8_t* const argb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  // Don't update Alpha (low 4 bits of argb[1])
+  argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+             VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  argb[1] = (argb[1] & 0x0f) | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4);
+}
+
+static inline void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+                                    uint8_t* const argb) {
+  argb[1] = 0x0f;  // alpha nibble at maximum; the call below keeps it
+  VP8YuvToRgba4444KeepA(y, u, v, argb);
+}
+
+static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+                               uint8_t* const bgr) {  // writes bgr[0..2] = B, G, R
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+}
+
+static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                uint8_t* const bgra) {
+  VP8YuvToBgr(y, u, v, bgra);  // fills bgra[0..2]
+  bgra[3] = 0xff;  // fourth byte (alpha) forced to its maximum value
+}
+
+static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                uint8_t* const rgba) {
+  VP8YuvToRgb(y, u, v, rgba);  // fills rgba[0..2]
+  rgba[3] = 0xff;  // fourth byte (alpha) forced to its maximum value
+}
+
+// Must be called before everything, to initialize the tables.
+void VP8YUVInit(void);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_YUV_H_ */
--- a/src/enc/Makefile.am
+++ b/src/enc/Makefile.am
@@ -1,15 +1,13 @@
 AM_CPPFLAGS = -I$(top_srcdir)/src

-libwebpencode_la_SOURCES = analysis.c bit_writer.c bit_writer.h \
-                          config.c cost.c cost.h dsp.c filter.c \
-                          frame.c iterator.c picture.c quant.c  \
-                          syntax.c tree.c vp8enci.h webpenc.c
-libwebpencode_la_LDFLAGS = -version-info 0:0:0 -lm
+libwebpencode_la_SOURCES = analysis.c config.c cost.c cost.h filter.c \
+                           frame.c iterator.c picture.c quant.c  \
+                           syntax.c tree.c vp8enci.h webpenc.c alpha.c \
+                           layer.c
+libwebpencode_la_LDFLAGS = -version-info 2:0:0 -lm
+libwebpencode_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
 libwebpencodeinclude_HEADERS = ../webp/encode.h ../webp/types.h
 libwebpencodeincludedir = $(includedir)/webp

-noinst_HEADERS = cost.h bit_writer.h vp8enci.h
+noinst_HEADERS = cost.h vp8enci.h
 noinst_LTLIBRARIES = libwebpencode.la
-# uncomment the following line (and comment the above) if you want
-# to install libwebpencode library.
-#lib_LTLIBRARIES = libwebpencode.la
--- a/src/enc/alpha.c
+++ b/src/enc/alpha.c
@@ -0,0 +1,114 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane compression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "vp8enci.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "zlib.h"
+#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+
+#define CHUNK_SIZE 8192
+
+//------------------------------------------------------------------------------
+
+static int CompressAlpha(const uint8_t* data, size_t data_size,
+                         uint8_t** output, size_t* output_size,
+                         int algo) {
+  // Deflates 'data' into a malloc'ed '*output'; algo != 0 favors speed.
+  int ret = Z_OK;
+  z_stream strm;
+  unsigned char chunk[CHUNK_SIZE];
+
+  *output = NULL;
+  *output_size = 0;
+  memset(&strm, 0, sizeof(strm));
+  if (deflateInit(&strm, algo ? Z_BEST_SPEED : Z_BEST_COMPRESSION) != Z_OK) {
+    return 0;
+  }
+  strm.next_in = (unsigned char*)data;
+  strm.avail_in = data_size;
+  do {
+    size_t size_out;
+    strm.next_out = chunk;        // each pass deflates into the scratch chunk
+    strm.avail_out = CHUNK_SIZE;
+    ret = deflate(&strm, Z_FINISH);
+    if (ret == Z_STREAM_ERROR) {
+      break;
+    }
+    size_out = CHUNK_SIZE - strm.avail_out;
+    if (size_out) {  // append the bytes just produced to the output buffer
+      uint8_t* new_output = realloc(*output, *output_size + size_out);
+      if (new_output == NULL) {
+        ret = Z_MEM_ERROR;  // '*output' still owns the old buffer, freed below
+        break;
+      }
+      memcpy(new_output + *output_size, chunk, size_out);
+      *output_size += size_out;
+      *output = new_output;
+    }
+  } while (ret != Z_STREAM_END || strm.avail_out == 0);
+
+  deflateEnd(&strm);
+  if (ret != Z_STREAM_END) {
+    free(*output);
+    *output = NULL;  // never hand a dangling pointer back (caller may free it)
+    *output_size = 0;
+    return 0;  // failure: '*output' is NULL and '*output_size' is 0
+  }
+  return 1;  // success: the caller owns '*output' and must free() it
+}
+
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+
+void VP8EncInitAlpha(VP8Encoder* enc) {
+  enc->has_alpha_ = (enc->pic_->a != NULL);  // set iff 'pic_->a' is non-NULL
+  enc->alpha_data_ = NULL;                   // no compressed alpha held yet
+  enc->alpha_data_size_ = 0;
+}
+
+void VP8EncCodeAlphaBlock(VP8EncIterator* it) {
+  (void)it;  // parameter is intentionally unused
+  // Nothing for now: VP8EncFinishAlpha() ZLIB-compresses the whole plane.
+}
+
+int VP8EncFinishAlpha(VP8Encoder* enc) {
+  if (enc->has_alpha_) {  // otherwise there is nothing to compress
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    const WebPPicture* pic = enc->pic_;
+    assert(pic->a);  // has_alpha_ was derived from 'pic_->a != NULL'
+    if (!CompressAlpha(pic->a, pic->width * pic->height,
+                       &enc->alpha_data_, &enc->alpha_data_size_,
+                       enc->config_->alpha_compression)) {
+      return 0;  // compression failed
+    }
+#endif  // WEBP_EXPERIMENTAL_FEATURES
+  }
+  return 1;  // success, or no alpha data was produced
+}
+
+void VP8EncDeleteAlpha(VP8Encoder* enc) {
+  free(enc->alpha_data_);   // free(NULL) is a no-op, so no guard is needed
+  enc->alpha_data_ = NULL;  // reset the fields so a repeated call is harmless
+  enc->alpha_data_size_ = 0;
+  enc->has_alpha_ = 0;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/analysis.c
+++ b/src/enc/analysis.c
@@ -20,52 +20,13 @@
 extern "C" {
 #endif

-#define MAX_COEFF_THRESH   64
 #define MAX_ITERS_K_MEANS  6

-//-----------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
 static int ClipAlpha(int alpha) {
  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;
 }

-static int GetAlpha(const int histo[MAX_COEFF_THRESH]) {
-  int num = 0, den = 0, val = 0;
-  int k;
-  int alpha;
-  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
-    if (histo[k]) {
-      val += histo[k];
-      num += val * (k + 1);
-      den += (k + 1) * (k + 1);
-    }
-  }
-  // we scale the value to a usable [0..255] range
-  alpha = den ? 10 * num / den - 5 : 0;
-  return ClipAlpha(alpha);
-}
-
-static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                            int start_block, int end_block) {
-  int histo[MAX_COEFF_THRESH] = { 0 };
-  int16_t out[16];
-  int j, k;
-  for (j = start_block; j < end_block; ++j) {
-    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
-    for (k = 0; k < 16; ++k) {
-      const int v = abs(out[k]) >> 2;
-      if (v) {
-        const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
-        histo[bin - 1]++;
-      }
-    }
-  }
-  return GetAlpha(histo);
-}
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Smooth the segment map by replacing isolated block by the majority of its
 // neighbours.

@@ -86,11 +47,11 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
      cnt[mb[-w - 1].segment_]++;  // top-left
      cnt[mb[-w + 0].segment_]++;  // top
      cnt[mb[-w + 1].segment_]++;  // top-right
-      cnt[mb[   - 1].segment_]++;    // left
-      cnt[mb[   + 1].segment_]++;    // right
-      cnt[mb[ w - 1].segment_]++;   // bottom-left
-      cnt[mb[ w + 0].segment_]++;   // bottom
-      cnt[mb[ w + 1].segment_]++;   // bottom-right
+      cnt[mb[   - 1].segment_]++;  // left
+      cnt[mb[   + 1].segment_]++;  // right
+      cnt[mb[ w - 1].segment_]++;  // bottom-left
+      cnt[mb[ w + 0].segment_]++;  // bottom
+      cnt[mb[ w + 1].segment_]++;  // bottom-right
      for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
        if (cnt[n] >= majority_cnt_3_x_3_grid) {
          majority_seg = n;
@@ -108,7 +69,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
  free(tmp);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Finalize Segment probability based on the coding tree

 static int GetProba(int a, int b) {
@@ -178,7 +139,7 @@ static void SetSegmentAlphas(VP8Encoder* const enc,
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram

 static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
@@ -259,7 +220,7 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) {
  SetSegmentAlphas(enc, centers, weighted_average);  // pick some alphas.
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Macroblock analysis: collect histogram for each mode, deduce the maximal
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
@@ -278,9 +239,9 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {

  VP8MakeLuma16Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF,
-                                       it->yuv_p_ + VP8I16ModeOffsets[mode],
-                                       0, 16);
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+                                          it->yuv_p_ + VP8I16ModeOffsets[mode],
+                                          0, 16);
    if (alpha > best_alpha) {
      best_alpha = alpha;
      best_mode = mode;
@@ -303,9 +264,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,

    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < max_mode; ++mode) {
-      const int alpha = CollectHistogram(src,
-                                         it->yuv_p_ + VP8I4ModeOffsets[mode],
-                                         0, 1);
+      const int alpha = VP8CollectHistogram(src,
+                                            it->yuv_p_ + VP8I4ModeOffsets[mode],
+                                            0, 1);
      if (alpha > best_mode_alpha) {
        best_mode_alpha = alpha;
        modes[it->i4_] = mode;
@@ -329,9 +290,9 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
  int mode;
  VP8MakeChroma8Preds(it);
  for (mode = 0; mode < max_mode; ++mode) {
-    const int alpha = CollectHistogram(it->yuv_in_ + U_OFF,
-                                       it->yuv_p_ + VP8UVModeOffsets[mode],
-                                       16, 16 + 4 + 4);
+    const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF,
+                                          it->yuv_p_ + VP8UVModeOffsets[mode],
+                                          16, 16 + 4 + 4);
    if (alpha > best_alpha) {
      best_alpha = alpha;
      best_mode = mode;
@@ -367,7 +328,7 @@ static void MBAnalyze(VP8EncIterator* const it,
  it->mb_->alpha_ = best_alpha;   // Informative only.
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main analysis loop:
 // Collect all susceptibilities for each macroblock and record their
 // distribution in alphas[]. Segments is assigned a-posteriori, based on
--- a/src/enc/config.c
+++ b/src/enc/config.c
@@ -10,15 +10,15 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
-#include "webp/encode.h"
+#include "../webp/encode.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPConfig
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int WebPConfigInitInternal(WebPConfig* const config,
                           WebPPreset preset, float quality, int version) {
@@ -41,6 +41,8 @@ int WebPConfigInitInternal(WebPConfig* const config,
  config->show_compressed = 0;
  config->preprocessing = 0;
  config->autofilter = 0;
+  config->alpha_compression = 0;
+  config->partition_limit = 0;

  // TODO(skal): tune.
  switch (preset) {
@@ -105,10 +107,14 @@ int WebPValidateConfig(const WebPConfig* const config) {
    return 0;
  if (config->partitions < 0 || config->partitions > 3)
    return 0;
+  if (config->partition_limit < 0 || config->partition_limit > 100)
+    return 0;
+  if (config->alpha_compression < 0)
+    return 0;
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/cost.c
+++ b/src/enc/cost.c
@@ -17,7 +17,7 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Boolean-cost cost table

 const uint16_t VP8EntropyCost[256] = {
@@ -49,13 +49,13 @@ const uint16_t VP8EntropyCost[256] = {
    10,    9,    7,    6,    4,    3
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Level cost tables

 // For each given level, the following table given the pattern of contexts
 // to use for coding it (in [][0]) as well as the bit value to use for
 // each context (in [][1]).
-static const uint16_t kLevelCodes[MAX_VARIABLE_LEVEL][2] = {
+const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
                  {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005},
  {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023},
  {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013},
@@ -337,8 +337,8 @@ const uint16_t VP8LevelFixedCosts[2048] = {
 };

 static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
-  int pattern = kLevelCodes[level - 1][0];
-  int bits = kLevelCodes[level - 1][1];
+  int pattern = VP8LevelCodes[level - 1][0];
+  int bits = VP8LevelCodes[level - 1][1];
  int cost = 0;
  int i;
  for (i = 2; pattern; ++i) {
@@ -351,7 +351,7 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
  return cost;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Pre-calc level costs once for all

 void VP8CalculateLevelCosts(VP8Proba* const proba) {
@@ -374,12 +374,13 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Mode cost tables.

 // These are the fixed probabilities (in the coding trees) turned into bit-cost
 // by calling VP8BitCost().
 const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
+// note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
@@ -484,7 +485,7 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/cost.h
+++ b/src/enc/cost.h
@@ -27,11 +27,13 @@ static inline int VP8BitCost(int bit, uint8_t proba) {
 }

 // Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability.
-static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total, uint8_t proba) {
+static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total,
+                                     uint8_t proba) {
  return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }

 // Level cost calculations
+extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
 void VP8CalculateLevelCosts(VP8Proba* const proba);
 static inline int VP8LevelCost(const uint16_t* const table, int level) {
  return VP8LevelFixedCosts[level]
@@ -43,10 +45,10 @@ extern const uint16_t VP8FixedCostsUV[4];
 extern const uint16_t VP8FixedCostsI16[4];
 extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif  // WEBP_ENC_COST_H_
+#endif  /* WEBP_ENC_COST_H_ */
--- a/src/enc/enc.c
+++ b/src/enc/enc.c
@@ -16,7 +16,55 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static int ClipAlpha(int alpha) {
+  return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha;  // clamp to [0, 255]
+}
+
+int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
+  int num = 0, den = 0, val = 0;
+  int k;
+  int alpha;
+  // note: changing this loop to avoid the numerous "k + 1" slows things down.
+  for (k = 0; k < MAX_COEFF_THRESH; ++k) {
+    if (histo[k + 1]) {  // empty bins are skipped; histo[0] is never read
+      val += histo[k + 1];       // running count over the bins seen so far
+      num += val * (k + 1);      // running count weighted by the bin index
+      den += (k + 1) * (k + 1);  // squared bin index
+    }
+  }
+  // Scale the ratio, then clip it to the usable [0..255] range.
+  alpha = den ? 10 * num / den - 5 : 0;  // den == 0: bins 1.. are all empty
+  return ClipAlpha(alpha);
+}
+
+static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                            int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };  // histo[b]: coeffs seen in bin b
+  int16_t out[16];  // transform output, then reused to hold the bin indices
+  int j, k;
+  for (j = start_block; j < end_block; ++j) {  // blocks [start, end)
+    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    for (k = 0; k < 16; ++k) {
+      const int v = abs(out[k]) >> 2;  // coefficient magnitude / 4
+      out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;  // saturate
+    }
+
+    // Use bin to update histogram.
+    for (k = 0; k < 16; ++k) {
+      histo[out[k]]++;  // bin 0 is counted too, but VP8GetAlpha() skips it
+    }
+  }
+
+  return VP8GetAlpha(histo);
+}
+
+//------------------------------------------------------------------------------
 // run-time tables (~4k)

 static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
@@ -39,7 +87,7 @@ static inline uint8_t clip_8b(int v) {
  return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)

 #define STORE(x, y, v) \
@@ -49,7 +97,8 @@ static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)

-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
+static inline void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
@@ -81,6 +130,14 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) {
  }
 }

+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {  // non-zero: also transform a second block
+    ITransformOne(ref + 4, in + 16, dst + 4);  // +4 bytes, next 16 coeffs
+  }
+}
+
 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  int i;
  int tmp[16];
@@ -166,16 +223,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
  }
 }

-// default C implementations:
-VP8Idct VP8ITransform = ITransform;
-VP8Fdct VP8FTransform = FTransform;
-VP8WHT VP8ITransformWHT = ITransformWHT;
-VP8WHT VP8FTransformWHT = FTransformWHT;
-
 #undef MUL
 #undef STORE

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra predictions

 #define OUT(x, y) dst[(x) + (y) * BPS]
@@ -260,7 +311,7 @@ static inline void DCMode(uint8_t* dst, const uint8_t* left,
  Fill(dst, DC, size);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)

 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
@@ -280,7 +331,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
  TrueMotion(C8TM8 + dst, left, top, 8);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)

 static void Intra16Preds(uint8_t* dst,
@@ -291,7 +342,7 @@ static void Intra16Preds(uint8_t* dst,
  TrueMotion(I16TM16 + dst, left, top, 16);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // luma 4x4 prediction

 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
@@ -478,12 +529,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  HU4(I4HU4 + dst, top);
 }

-// default C implementations
-VP8Intra4Preds VP8EncPredLuma4 = Intra4Preds;
-VP8IntraPreds VP8EncPredLuma16 = Intra16Preds;
-VP8IntraPreds VP8EncPredChroma8 = IntraChromaPreds;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Metric

 static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) {
@@ -513,22 +559,19 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  return GetSSE(a, b, 4, 4);
 }

-// default C implementations
-VP8Metric VP8SSE16x16 = SSE16x16;
-VP8Metric VP8SSE8x8 = SSE8x8;
-VP8Metric VP8SSE16x8 = SSE16x8;
-VP8Metric VP8SSE4x4 = SSE4x4;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Texture distortion
 //
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.

 // Hadamard transform
-static void TTransform(const uint8_t* in, int16_t* out) {
+// Returns the weighted sum of the absolute value of transformed coefficients.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum = 0;
  int tmp[16];
  int i;
+  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = (in[0] + in[2]) << 2;
    const int a1 = (in[1] + in[3]) << 2;
@@ -539,7 +582,8 @@ static void TTransform(const uint8_t* in, int16_t* out) {
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
-  for (i = 0; i < 4; ++i) {
+  // vertical pass
+  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
@@ -548,24 +592,20 @@ static void TTransform(const uint8_t* in, int16_t* out) {
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
-    out[ 0 + i] = (b0 + (b0 < 0) + 3) >> 3;
-    out[ 4 + i] = (b1 + (b1 < 0) + 3) >> 3;
-    out[ 8 + i] = (b2 + (b2 < 0) + 3) >> 3;
-    out[12 + i] = (b3 + (b3 < 0) + 3) >> 3;
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    sum += w[ 0] * ((abs(b0) + 3) >> 3);
+    sum += w[ 4] * ((abs(b1) + 3) >> 3);
+    sum += w[ 8] * ((abs(b2) + 3) >> 3);
+    sum += w[12] * ((abs(b3) + 3) >> 3);
  }
+  return sum;
 }

 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
-  int16_t tmp1[16], tmp2[16];
-  int k;
-  int D;
-  TTransform(a, tmp1);
-  TTransform(b, tmp2);
-  D = 0;
-  for (k = 0; k < 16; ++k)
-    D += w[k] * (abs(tmp2[k]) - abs(tmp1[k]));
-  return (abs(D) + 8) >> 4;
+  const int sum1 = TTransform(a, w);
+  const int sum2 = TTransform(b, w);
+  return (abs(sum2 - sum1) + 8) >> 4;
 }

 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
@@ -580,10 +620,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
  return D;
 }

-VP8WMetric VP8TDisto4x4 = Disto4x4;
-VP8WMetric VP8TDisto16x16 = Disto16x16;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantization
 //

@@ -612,10 +649,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
  return (last >= 0);
 }

-// default C implementation
-VP8QuantizeBlock VP8EncQuantizeBlock = QuantizeBlock;
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Block copy

 static inline void Copy(const uint8_t* src, uint8_t* dst, int size) {
@@ -631,15 +665,104 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
 static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); }
 static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); }

-// default C implementations
-VP8BlockCopy VP8Copy4x4 = Copy4x4;
-VP8BlockCopy VP8Copy8x8 = Copy8x8;
-VP8BlockCopy VP8Copy16x16 = Copy16x16;
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//

-//-----------------------------------------------------------------------------
+#if defined(__pic__) && defined(__i386__)
+static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"   // save ebx in edi (presumably the PIC register)
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"  // restore ebx; edi now holds cpuid's ebx output
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));     // info_type is passed to cpuid in eax
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static inline void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"           // outputs eax, ebx, ecx, edx land in cpu_info[0..3]
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));  // info_type is passed to cpuid in eax
+}
+#elif defined(_MSC_VER)  // Visual C++
+#define GetCPUInfo __cpuid
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);  // query with info_type 1
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);  // bit 26 of cpu_info[3]
+  }
+  if (feature == kSSE3) {
+    return 0 != (cpu_info[2] & 0x00000001);  // bit 0 of cpu_info[2]
+  }
+  return 0;  // any other feature is reported as unsupported
+}
+VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo;
+#else
+VP8CPUInfo VP8EncGetCPUInfo = NULL;
+#endif
+
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8WHT VP8ITransformWHT;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy8x8;
+VP8BlockCopy VP8Copy16x16;
+
+extern void VP8EncDspInitSSE2(void);

 void VP8EncDspInit(void) {
  InitTables();
+
+  // default C implementations
+  VP8CollectHistogram = CollectHistogram;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8ITransformWHT = ITransformWHT;
+  VP8FTransformWHT = FTransformWHT;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8Copy4x4 = Copy4x4;
+  VP8Copy8x8 = Copy8x8;
+  VP8Copy16x16 = Copy16x16;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8EncGetCPUInfo) {
+    if (VP8EncGetCPUInfo(kSSE2)) {
+#if defined(__SSE2__) || defined(_MSC_VER)
+      VP8EncDspInitSSE2();
+#endif
+    }
+    if (VP8EncGetCPUInfo(kSSE3)) {
+      // later we'll plug some SSE3 variant here
+    }
+  }
 }

 #if defined(__cplusplus) || defined(c_plusplus)
--- a/src/enc/enc_sse2.c
+++ b/src/enc/enc_sse2.c
@@ -0,0 +1,834 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of speed-critical functions.
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#if defined(__SSE2__) || defined(_MSC_VER)
+#include <emmintrin.h>
+
+#include "vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
+                                int start_block, int end_block) {
+  int histo[MAX_COEFF_THRESH + 1] = { 0 };
+  int16_t out[16];
+  int j, k;
+  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+  for (j = start_block; j < end_block; ++j) {
+    VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out);
+
+    // Convert coefficients to bin (within out[]).
+    {
+      // Load.
+      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+      // sign(out) = out >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign0 = _mm_srai_epi16(out0, 15);
+      const __m128i sign1 = _mm_srai_epi16(out1, 15);
+      // abs(out) = (out ^ sign) - sign
+      const __m128i xor0 = _mm_xor_si128(out0, sign0);
+      const __m128i xor1 = _mm_xor_si128(out1, sign1);
+      const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
+      const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
+      // v = abs(out) >> 2
+      const __m128i v0 = _mm_srai_epi16(abs0, 2);
+      const __m128i v1 = _mm_srai_epi16(abs1, 2);
+      // bin = min(v, MAX_COEFF_THRESH)
+      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+      // Store.
+      _mm_storeu_si128((__m128i*)&out[0], bin0);
+      _mm_storeu_si128((__m128i*)&out[8], bin1);
+    }
+
+    // Use bin to update histogram.
+    for (k = 0; k < 16; ++k) {
+      histo[out[k]]++;
+    }
+  }
+
+  return VP8GetAlpha(histo);
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Does one or two inverse transforms.
+static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85627  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant becomes the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two inverse
+  // transforms in parallel). In the case of only one inverse transform, the
+  // second half of the vectors holds no coefficients (_mm_loadl_epi64 zeroes
+  // it); the values computed there are never used nor stored.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'ref' and store.
+  {
+    const __m128i zero = _mm_set1_epi16(0);
+    // Load the reference(s).
+    __m128i ref0, ref1, ref2, ref3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
+      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
+      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
+      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
+    }
+    // Convert to 16b.
+    ref0 = _mm_unpacklo_epi8(ref0, zero);
+    ref1 = _mm_unpacklo_epi8(ref1, zero);
+    ref2 = _mm_unpacklo_epi8(ref2, zero);
+    ref3 = _mm_unpacklo_epi8(ref3, zero);
+    // Add the inverse transform(s).
+    ref0 = _mm_add_epi16(ref0, T0);
+    ref1 = _mm_add_epi16(ref1, T1);
+    ref2 = _mm_add_epi16(ref2, T2);
+    ref3 = _mm_add_epi16(ref3, T3);
+    // Unsigned saturate to 8b.
+    ref0 = _mm_packus_epi16(ref0, ref0);
+    ref1 = _mm_packus_epi16(ref1, ref1);
+    ref2 = _mm_packus_epi16(ref2, ref2);
+    ref3 = _mm_packus_epi16(ref3, ref3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
+    }
+  }
+}
+
+static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i seven = _mm_set1_epi16(7);
+  const __m128i k7500 = _mm_set1_epi32(7500);
+  const __m128i k14500 = _mm_set1_epi32(14500);
+  const __m128i k51000 = _mm_set1_epi32(51000);
+  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
+                                           5352,  2217, 5352,  2217);
+  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+                                           2217, -5352, 2217, -5352);
+
+  __m128i v01, v32;
+
+  // Difference between src and ref and initial transpose.
+  {
+    // Load src and convert to 16b.
+    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
+    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
+    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
+    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
+    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+    // Load ref and convert to 16b.
+    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
+    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
+    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
+    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+    // Compute difference.
+    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+    // Transpose.
+    // 00 01 02 03   0 0 0 0
+    // 10 11 12 13   0 0 0 0
+    // 20 21 22 23   0 0 0 0
+    // 30 31 32 33   0 0 0 0
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // a02 a12 a22 a32   a03 a13 a23 a33
+    // a00 a10 a20 a30   a01 a11 a21 a31
+    // a03 a13 a23 a33   a02 a12 a22 a32
+  }
+
+  // First pass and subsequent transpose.
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // b0 = (a0 + a3) << 3
+    // b1 = (a1 + a2) << 3
+    // b3 = (a0 - a3) << 3
+    // b2 = (a1 - a2) << 3
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i b01 = _mm_slli_epi16(a01, 3);
+    const __m128i b32 = _mm_slli_epi16(a32, 3);
+    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
+    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);
+
+    // e0 = b0 + b1
+    // e2 = b0 - b1
+    const __m128i e0 = _mm_add_epi16(b01, b11);
+    const __m128i e2 = _mm_sub_epi16(b01, b11);
+    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);
+
+    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
+    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
+    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k14500);
+    const __m128i d3 = _mm_add_epi32(c3, k7500);
+    const __m128i e1 = _mm_srai_epi32(d1, 12);
+    const __m128i e3 = _mm_srai_epi32(d3, 12);
+    const __m128i e13 = _mm_packs_epi32(e1, e3);
+
+    // Transpose.
+    // 00 01 02 03  20 21 22 23
+    // 10 11 12 13  30 31 32 33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
+    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
+    // 00 10 01 11   02 12 03 13
+    // 20 30 21 31   22 32 23 33
+    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
+    // 02 12 22 32   03 13 23 33
+    // 00 10 20 30   01 11 21 31
+    // 03 13 23 33   02 12 22 32
+  }
+
+  // Second pass
+  {
+    // Same operations are done on the (0,3) and (1,2) pairs.
+    // a0 = v0 + v3
+    // a1 = v1 + v2
+    // a3 = v0 - v3
+    // a2 = v1 - v2
+    const __m128i a01 = _mm_add_epi16(v01, v32);
+    const __m128i a32 = _mm_sub_epi16(v01, v32);
+    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+
+    // d0 = (a0 + a1 + 7) >> 4;
+    // d2 = (a0 - a1 + 7) >> 4;
+    const __m128i b0 = _mm_add_epi16(a01, a11);
+    const __m128i b2 = _mm_sub_epi16(a01, a11);
+    const __m128i c0 = _mm_add_epi16(b0, seven);
+    const __m128i c2 = _mm_add_epi16(b2, seven);
+    const __m128i d0 = _mm_srai_epi16(c0, 4);
+    const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+    // f1 = ((a3 * 5352 + a2 * 2217 + 12000) >> 16)
+    // f3 = ((a3 * 2217 - a2 * 5352 + 51000) >> 16)
+    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+    const __m128i d3 = _mm_add_epi32(c3, k51000);
+    const __m128i e1 = _mm_srai_epi32(d1, 16);
+    const __m128i e3 = _mm_srai_epi32(d3, 16);
+    const __m128i f1 = _mm_packs_epi32(e1, e1);
+    const __m128i f3 = _mm_packs_epi32(e3, e3);
+    // f1 = f1 + (a3 != 0);
+    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+    // desired (0, 1), we add one earlier through k12000_plus_one.
+    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+    _mm_storel_epi64((__m128i*)&out[ 0], d0);
+    _mm_storel_epi64((__m128i*)&out[ 4], g1);
+    _mm_storel_epi64((__m128i*)&out[ 8], d2);
+    _mm_storel_epi64((__m128i*)&out[12], f3);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+  const __m128i zero = _mm_set1_epi16(0);
+
+  // Load values.
+  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
+  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
+  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
+  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
+  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
+  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
+  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
+  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
+
+  // Combine pair of lines and convert to 16b.
+  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
+  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
+  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
+  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
+  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
+
+  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
+  // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
+  //                  need absolute values, there is no need to do calculation
+  //                  in 8bit as we are already in 16bit, ... Yet this is what
+  //                  benchmarks the fastest!
+  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
+  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
+  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
+  const __m128i d3 = _mm_subs_epu8(b23s, a23s);
+
+  // Square and add them all together.
+  const __m128i madd0 = _mm_madd_epi16(d0, d0);
+  const __m128i madd1 = _mm_madd_epi16(d1, d1);
+  const __m128i madd2 = _mm_madd_epi16(d2, d2);
+  const __m128i madd3 = _mm_madd_epi16(d3, d3);
+  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
+  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
+  const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+  int32_t tmp[4];
+  _mm_storeu_si128((__m128i*)tmp, sum2);
+  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values of
+// the transformed coefficients of inA and inB.
+static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
+                          const uint16_t* const w) {
+  int32_t sum[4];
+  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i three = _mm_set1_epi16(3);
+
+  // Load, combine and transpose inputs.
+  {
+    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
+    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
+    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
+    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
+    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
+    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
+    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
+    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
+
+    // Combine inA and inB (we'll do two transforms in parallel).
+    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
+    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
+    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
+    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
+    // a00 b00 a01 b01 a02 b02 a03 b03   0 0 0 0 0 0 0 0
+    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
+    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
+    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0
+
+    // Transpose the two 4x4, discarding the filling zeroes.
+    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
+    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
+    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
+    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
+    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33
+
+    // Convert to 16b.
+    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
+    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
+    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
+    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
+    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
+    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
+    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
+    // b0_extra = (a0 != 0);
+    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
+    const __m128i b0_base = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+
+    // Transpose the two 4x4.
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 b22 b32 b03 b13 b23 b33
+    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Vertical pass and difference of weighted sums.
+  {
+    // Load all inputs.
+    // TODO(cduvivier): Make variable declarations and allocations aligned so
+    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
+    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
+
+    // Calculate a and b (two 4x4 at once).
+    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+    const __m128i b0 = _mm_add_epi16(a0, a1);
+    const __m128i b1 = _mm_add_epi16(a3, a2);
+    const __m128i b2 = _mm_sub_epi16(a3, a2);
+    const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+    // Separate the transforms of inA and inB.
+    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+    {
+      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
+      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
+      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
+      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
+      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
+
+      // b = abs(b) = (b ^ sign) - sign
+      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
+      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
+      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
+      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
+      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
+      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
+      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
+      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
+    }
+
+    // b = abs(b) + 3
+    A_b0 = _mm_add_epi16(A_b0, three);
+    A_b2 = _mm_add_epi16(A_b2, three);
+    B_b0 = _mm_add_epi16(B_b0, three);
+    B_b2 = _mm_add_epi16(B_b2, three);
+
+    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
+    // b = (abs(b) + 3) >> 3
+    A_b0 = _mm_srai_epi16(A_b0, 3);
+    A_b2 = _mm_srai_epi16(A_b2, 3);
+    B_b0 = _mm_srai_epi16(B_b0, 3);
+    B_b2 = _mm_srai_epi16(B_b2, 3);
+
+    // weighted sums
+    A_b0 = _mm_madd_epi16(A_b0, w_0);
+    A_b2 = _mm_madd_epi16(A_b2, w_8);
+    B_b0 = _mm_madd_epi16(B_b0, w_0);
+    B_b2 = _mm_madd_epi16(B_b2, w_8);
+    A_b0 = _mm_add_epi32(A_b0, A_b2);
+    B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+    // difference of weighted sums
+    A_b0 = _mm_sub_epi32(A_b0, B_b0);
+    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
+  }
+  return sum[0] + sum[1] + sum[2] + sum[3];
+}
+
+static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int diff_sum = TTransformSSE2(a, b, w);
+  return (abs(diff_sum) + 8) >> 4;
+}
+
+static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4SSE2(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Simple quantization
+static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
+                             int n, const VP8Matrix* const mtx) {
+  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i sign0, sign8;
+  __m128i coeff0, coeff8;
+  __m128i out0, out8;
+  __m128i packed_out;
+
+  // Load all inputs.
+  // TODO(cduvivier): Make variable declarations and allocations aligned so that
+  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
+  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
+  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
+  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
+  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
+  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
+  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
+  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
+  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
+  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
+
+  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
+  sign0 = _mm_srai_epi16(in0, 15);
+  sign8 = _mm_srai_epi16(in8, 15);
+
+  // coeff = abs(in) = (in ^ sign) - sign
+  coeff0 = _mm_xor_si128(in0, sign0);
+  coeff8 = _mm_xor_si128(in8, sign8);
+  coeff0 = _mm_sub_epi16(coeff0, sign0);
+  coeff8 = _mm_sub_epi16(coeff8, sign8);
+
+  // coeff = abs(in) + sharpen
+  coeff0 = _mm_add_epi16(coeff0, sharpen0);
+  coeff8 = _mm_add_epi16(coeff8, sharpen8);
+
+  // if (coeff > 2047) coeff = 2047
+  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
+  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);
+
+  // out = (coeff * iQ + B) >> QFIX;
+  {
+    // doing calculations with 32b precision (QFIX=17)
+    // out = (coeff * iQ)
+    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+    // expand bias from 16b to 32b
+    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
+    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
+    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
+    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
+    // out = (coeff * iQ + B)
+    out_00 = _mm_add_epi32(out_00, bias_00);
+    out_04 = _mm_add_epi32(out_04, bias_04);
+    out_08 = _mm_add_epi32(out_08, bias_08);
+    out_12 = _mm_add_epi32(out_12, bias_12);
+    // out = (coeff * iQ + B) >> QFIX;
+    out_00 = _mm_srai_epi32(out_00, QFIX);
+    out_04 = _mm_srai_epi32(out_04, QFIX);
+    out_08 = _mm_srai_epi32(out_08, QFIX);
+    out_12 = _mm_srai_epi32(out_12, QFIX);
+    // pack result as 16b
+    out0 = _mm_packs_epi32(out_00, out_04);
+    out8 = _mm_packs_epi32(out_08, out_12);
+  }
+
+  // get sign back (if (sign[j]) out_n = -out_n)
+  out0 = _mm_xor_si128(out0, sign0);
+  out8 = _mm_xor_si128(out8, sign8);
+  out0 = _mm_sub_epi16(out0, sign0);
+  out8 = _mm_sub_epi16(out8, sign8);
+
+  // in = out * Q
+  in0 = _mm_mullo_epi16(out0, q0);
+  in8 = _mm_mullo_epi16(out8, q8);
+
+  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
+  {
+    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
+    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
+    in0 = _mm_and_si128(in0, cmp0);
+    in8 = _mm_and_si128(in8, cmp8);
+    _mm_storeu_si128((__m128i*)&in[0], in0);
+    _mm_storeu_si128((__m128i*)&in[8], in8);
+    out0 = _mm_and_si128(out0, cmp0);
+    out8 = _mm_and_si128(out8, cmp8);
+  }
+
+  // zigzag the output before storing it.
+  //
+  // The zigzag pattern can almost be reproduced with a small sequence of
+  // shuffles. After it, we only need to swap the 7th (ending up in third
+  // position instead of twelfth) and 8th values.
+  {
+    __m128i outZ0, outZ8;
+    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
+    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
+    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
+    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
+    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
+    _mm_storeu_si128((__m128i*)&out[0], outZ0);
+    _mm_storeu_si128((__m128i*)&out[8], outZ8);
+    packed_out = _mm_packs_epi16(outZ0, outZ8);
+  }
+  {
+    const int16_t outZ_12 = out[12];
+    const int16_t outZ_3 = out[3];
+    out[3] = outZ_12;
+    out[12] = outZ_3;
+  }
+
+  // detect if all 'out' values are zeroes or not
+  {
+    int32_t tmp[4];
+    _mm_storeu_si128((__m128i*)tmp, packed_out);
+    if (n) {
+      tmp[0] &= ~0xff;
+    }
+    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
+  }
+}
+
+extern void VP8EncDspInitSSE2(void);
+void VP8EncDspInitSSE2(void) {
+  VP8CollectHistogram = CollectHistogramSSE2;
+  VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8ITransform = ITransformSSE2;
+  VP8FTransform = FTransformSSE2;
+  VP8SSE4x4 = SSE4x4SSE2;
+  VP8TDisto4x4 = Disto4x4SSE2;
+  VP8TDisto16x16 = Disto16x16SSE2;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif   // __SSE2__ || _MSC_VER
--- a/src/enc/filter.c
+++ b/src/enc/filter.c
@@ -45,7 +45,7 @@ static void InitTables(void) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Edge filtering functions

 // 4 pixels in, 2 pixels out
@@ -92,7 +92,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) {
         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)

 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
@@ -129,7 +129,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)

 static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size,
@@ -177,7 +177,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
 void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
@@ -187,7 +187,7 @@ void (*VP8EncHFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
 void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
 void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 15.4: compute the inner-edge filtering strength

 static int GetILevel(int sharpness, int level) {
@@ -229,7 +229,7 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // SSIM metric

 enum { KERNEL = 3 };
@@ -302,7 +302,7 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
  return GetSSIM(&s);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Exposed APIs: Encoder should call the following 3 functions to adjust
 // loop filter strength

--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -37,7 +37,7 @@ typedef struct {
  CostArray*  cost;
 } VP8Residual;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Tables for level coding

 const uint8_t VP8EncBands[16 + 1] = {
@@ -51,18 +51,16 @@ static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 };
 static const uint8_t kCat6[] =
    { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Reset the statistics about: number of skips, token proba, level cost,...

 static void ResetStats(VP8Encoder* const enc, int precalc_cost) {
  VP8Proba* const proba = &enc->proba_;
  if (precalc_cost) VP8CalculateLevelCosts(proba);
  proba->nb_skip_ = 0;
-  proba->nb_i4_ = 0;
-  proba->nb_i16_ = 0;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Skip decision probability

 static int CalcSkipProba(uint64_t nb, uint64_t total) {
@@ -86,7 +84,7 @@ static int FinalizeSkipProba(VP8Encoder* const enc) {
  return size;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Recording of token probabilities.

 static void ResetTokenStats(VP8Encoder* const enc) {
@@ -101,6 +99,9 @@ static int Record(int bit, uint64_t* const stats) {
  return bit;
 }

+// We keep the table-free variant around for reference, just in case.
+#define USE_LEVEL_CODE_TABLE
+
 // Simulate block coding, but only record statistics.
 // Note: no need to record the fixed probas.
 static int RecordCoeffs(int ctx, VP8Residual* res) {
@@ -111,14 +112,16 @@ static int RecordCoeffs(int ctx, VP8Residual* res) {
  }

  while (1) {
-    const int v = abs(res->coeffs[n++]);
+    int v = res->coeffs[n++];
    if (!Record(v != 0, s[1])) {
      s = res->stats[VP8EncBands[n]][0];
      continue;
    }
-    if (!Record(v > 1, s[2])) {
+    if (!Record(2u < (unsigned int)(v + 1), s[2])) {  // v = -1 or 1
      s = res->stats[VP8EncBands[n]][1];
    } else {
+      v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
      if (!Record(v > 4, s[3])) {
        if (Record(v != 2, s[4]))
          Record(v == 4, s[5]);
@@ -129,6 +132,20 @@ static int RecordCoeffs(int ctx, VP8Residual* res) {
      } else {
        Record((v >= 3 + (8 << 3)), s[10]);
      }
+#else
+      if (v > MAX_VARIABLE_LEVEL)
+        v = MAX_VARIABLE_LEVEL;
+
+      {
+        const int bits = VP8LevelCodes[v - 1][1];
+        int pattern = VP8LevelCodes[v - 1][0];
+        int i;
+        for (i = 0; (pattern >>= 1) != 0; ++i) {
+          const int mask = 2 << i;
+          if (pattern & 1) Record(!!(bits & mask), s[3 + i]);
+        }
+      }
+#endif
      s = res->stats[VP8EncBands[n]][2];
    }
    if (n == 16 || !Record(n <= res->last, s[0])) {
@@ -174,7 +191,7 @@ static int FinalizeTokenProbas(VP8Encoder* const enc) {
  return size;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.

 static void InitResidual(int first, int coeff_type,
@@ -199,7 +216,7 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
  res->coeffs = coeffs;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Mode costs

 static int GetResidualCost(int ctx, const VP8Residual* const res) {
@@ -213,16 +230,18 @@ static int GetResidualCost(int ctx, const VP8Residual* const res) {
    return cost;
  }
  while (n <= res->last) {
-    const int v = abs(res->coeffs[n++]);
-    cost += VP8LevelCost(t, v);
+    const int v = res->coeffs[n++];
    if (v == 0) {
+      cost += VP8LevelCost(t, 0);
      p = res->prob[VP8EncBands[n]][0];
      t = res->cost[VP8EncBands[n]][0];
      continue;
-    } else if (v == 1) {
+    } else if (2u >= (unsigned int)(v + 1)) {   // v = -1 or 1
+      cost += VP8LevelCost(t, 1);
      p = res->prob[VP8EncBands[n]][1];
      t = res->cost[VP8EncBands[n]][1];
    } else {
+      cost += VP8LevelCost(t, abs(v));
      p = res->prob[VP8EncBands[n]][2];
      t = res->cost[VP8EncBands[n]][2];
    }
@@ -292,7 +311,7 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
  return R;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Coefficient coding

 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
@@ -462,7 +481,7 @@ static void RecordResiduals(VP8EncIterator* const it,
  VP8IteratorBytesToNz(it);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // ExtraInfo map / Debug function

 #if SEGMENT_VISU
@@ -525,7 +544,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main loops
 //
 //  VP8EncLoop(): does the final bitstream coding.
@@ -568,6 +587,14 @@ int VP8EncLoop(VP8Encoder* const enc) {
    } else {   // reset predictors after a skip
      ResetAfterSkip(&it);
    }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (enc->has_alpha_) {
+      VP8EncCodeAlphaBlock(&it);
+    }
+    if (enc->use_layer_) {
+      VP8EncCodeLayerBlock(&it);
+    }
+#endif
    StoreSideInfo(&it);
    VP8StoreFilterStats(&it);
    VP8IteratorExport(&it);
@@ -589,7 +616,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 //  VP8StatLoop(): only collect statistics (number of skips, token usage, ...)
 //                 This is used for deciding optimal probabilities. It also
 //                 modifies the quantizer value if some target (size, PNSR)
@@ -664,7 +691,7 @@ int VP8StatLoop(VP8Encoder* const enc) {
  }

  // binary search for a size close to target
-  for (pass = 0; pass < enc->config_->pass || (dqs[pass] > 0); ++pass) {
+  for (pass = 0; pass < enc->config_->pass && (dqs[pass] > 0); ++pass) {
    const int rd_opt = 1;
    float PSNR;
    int criterion;
@@ -688,7 +715,7 @@ int VP8StatLoop(VP8Encoder* const enc) {
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/iterator.c
+++ b/src/enc/iterator.c
@@ -17,9 +17,9 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Iterator
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static void InitLeft(VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
@@ -68,7 +68,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
  VP8IteratorReset(it);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.

@@ -122,7 +122,7 @@ void VP8IteratorImport(const VP8EncIterator* const it) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Copy back the compressed samples into user space if requested.

 void VP8IteratorExport(const VP8EncIterator* const it) {
@@ -148,16 +148,18 @@ void VP8IteratorExport(const VP8EncIterator* const it) {
      memcpy(ydst + i * pic->y_stride, ysrc + i * BPS, w);
    }
    // U/V plane
-    w = (w + 1) / 2;
-    h = (h + 1) / 2;
-    for (i = 0; i < h; ++i) {
-      memcpy(udst + i * pic->uv_stride, usrc + i * BPS, w);
-      memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, w);
+    {
+      const int uv_w = (w + 1) / 2;
+      const int uv_h = (h + 1) / 2;
+      for (i = 0; i < uv_h; ++i) {
+        memcpy(udst + i * pic->uv_stride, usrc + i * BPS, uv_w);
+        memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, uv_w);
+      }
    }
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Non-zero contexts setup/teardown

 // Nz bits:
@@ -214,7 +216,8 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
  nz |= (it->top_nz_[6] << 22) | (it->top_nz_[7] << 23);
  nz |= (it->top_nz_[8] << 24);  // we propagate the _top_ bit, esp. for intra4
  // left
-  nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7) | (it->left_nz_[2] << 11);
+  nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7);
+  nz |= (it->left_nz_[2] << 11);
  nz |= (it->left_nz_[4] << 17) | (it->left_nz_[6] << 21);

  *it->nz_ = nz;
@@ -222,7 +225,7 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {

 #undef BIT

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Advance to the next position, doing the bookeeping.

 int VP8IteratorNext(VP8EncIterator* const it,
@@ -267,7 +270,7 @@ int VP8IteratorNext(VP8EncIterator* const it,
  return (0 < --it->done_);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Helper function to set mode properties

 void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) {
@@ -304,7 +307,7 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment) {
  it->mb_->segment_ = segment;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Intra4x4 sub-blocks iteration
 //
 //  We store and update the boundary samples into an array of 37 pixels. They
@@ -399,7 +402,7 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/layer.c
+++ b/src/enc/layer.c
@@ -0,0 +1,55 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Enhancement layer (for YUV444/422)
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+
+//------------------------------------------------------------------------------
+
+void VP8EncInitLayer(VP8Encoder* const enc) {
+  // The enhancement layer is only used if the picture carries U0/V0 samples.
+  enc->use_layer_ = (enc->pic_->u0 != NULL);
+  enc->layer_data_ = NULL;
+  enc->layer_data_size_ = 0;
+  if (!enc->use_layer_) return;
+  VP8BitWriterInit(&enc->layer_bw_, enc->mb_w_ * enc->mb_h_ * 3);
+}
+
+void VP8EncCodeLayerBlock(VP8EncIterator* it) {
+  (void)it;   // 'it' is not used yet: this cast silences the compiler warning
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+}
+
+int VP8EncFinishLayer(VP8Encoder* const enc) {
+  if (!enc->use_layer_) return 1;   // no layer in use: nothing to finalize
+  // Close the bit-writer first, then record its output buffer and size.
+  enc->layer_data_ = VP8BitWriterFinish(&enc->layer_bw_);
+  enc->layer_data_size_ = VP8BitWriterSize(&enc->layer_bw_);
+  return 1;
+}
+
+void VP8EncDeleteLayer(VP8Encoder* enc) {
+  free(enc->layer_data_);   // NULL, or the buffer from VP8BitWriterFinish()
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/enc/picture.c
+++ b/src/enc/picture.c
@@ -9,6 +9,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <assert.h>
 #include <stdlib.h>
 #include "vp8enci.h"

@@ -16,54 +17,122 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPPicture
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int WebPPictureAlloc(WebPPicture* const picture) {
  if (picture) {
+    const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
+    const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
     const int width = picture->width;
     const int height = picture->height;
+    const int y_stride = width;
     const int uv_width = (width + 1) / 2;
     const int uv_height = (height + 1) / 2;
-    const uint64_t y_size = (uint64_t)width * height;
-    const uint64_t uv_size = (uint64_t)uv_width * uv_height;
-    const uint64_t total_size = y_size + 2 * uv_size;
+    const int uv_stride = uv_width;
+    int uv0_stride = 0;   // stays 0 unless full-height U0/V0 planes are needed
+    int a_width, a_stride;
+    uint64_t y_size, uv_size, uv0_size, a_size, total_size;
+    uint8_t* mem;         // walks through the single allocation, plane by plane
+
+    // U/V
+    switch (uv_csp) {
+      case WEBP_YUV420:
+        break;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      case WEBP_YUV400:    // for now, we'll just reset the U/V samples
+        break;
+      case WEBP_YUV422:
+        uv0_stride = uv_width;
+        break;
+      case WEBP_YUV444:
+        uv0_stride = width;
+        break;
+#endif
+      default:
+        return 0;
+    }
+    uv0_size = (uint64_t)height * uv0_stride;   // 64-bit product: no int overflow
+
+    // alpha
+    a_width = has_alpha ? width : 0;
+    a_stride = a_width;
+    y_size = (uint64_t)y_stride * height;
+    uv_size = (uint64_t)uv_stride * uv_height;
+    a_size =  (uint64_t)a_stride * height;
+
+    total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
+
     // Security and validation checks
-    if (uv_width <= 0 || uv_height <= 0 ||   // check param error
+    if (width <= 0 || height <= 0 ||       // check for luma/alpha param error
+        uv_width < 0 || uv_height < 0 ||   // check for u/v param error
         y_size >= (1ULL << 40) ||            // check for reasonable global size
         (size_t)total_size != total_size) {  // check for overflow on 32bit
       return 0;
     }
-    picture->y_stride = width;
-    picture->uv_stride = uv_width;
+    picture->y_stride  = y_stride;
+    picture->uv_stride = uv_stride;
+    picture->a_stride  = a_stride;
+    picture->uv0_stride  = uv0_stride;
     WebPPictureFree(picture);   // erase previous buffer
-    picture->y = (uint8_t*)malloc((size_t)total_size);
-    if (picture->y == NULL) return 0;
-    picture->u = picture->y + y_size;
-    picture->v = picture->u + uv_size;
+    mem = (uint8_t*)malloc((size_t)total_size);
+    if (mem == NULL) return 0;
+
+    picture->y = mem;
+    mem += y_size;
+
+    picture->u = mem;
+    mem += uv_size;
+    picture->v = mem;
+    mem += uv_size;
+
+    if (a_size) {     // alpha plane, only when the colorspace has the alpha bit
+      picture->a = mem;
+      mem += a_size;
+    }
+    if (uv0_size) {   // full-height U0/V0 planes (YUV422 / YUV444 only)
+      picture->u0 = mem;
+      mem += uv0_size;
+      picture->v0 = mem;
+      mem += uv0_size;
+    }
  }
  return 1;
 }

+// Copies every field of 'src' (when non-NULL) into 'dst', then clears all the
+// plane pointers of 'dst' so that it doesn't own any sample memory.
+static void WebPPictureGrabSpecs(const WebPPicture* const src,
+                                 WebPPicture* const dst) {
+  if (src != NULL) *dst = *src;
+  dst->y = dst->u = dst->v = NULL;   // main Y/U/V planes
+  dst->a = NULL;                     // alpha plane
+  dst->u0 = dst->v0 = NULL;          // full-height U/V planes (422/444)
+}
+
+// Release memory owned by 'picture'.
 void WebPPictureFree(WebPPicture* const picture) {
  if (picture) {
    free(picture->y);
-    picture->y = picture->u = picture->v = NULL;
+    WebPPictureGrabSpecs(NULL, picture);
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Picture copying

 int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) {
  int y;
  if (src == NULL || dst == NULL) return 0;
  if (src == dst) return 1;
-  *dst = *src;
-  dst->y = NULL;
+
+  WebPPictureGrabSpecs(src, dst);
  if (!WebPPictureAlloc(dst)) return 0;
+
  for (y = 0; y < dst->height; ++y) {
-    memcpy(dst->y + y * dst->y_stride, src->y + y * src->y_stride, src->width);
+    memcpy(dst->y + y * dst->y_stride,
+           src->y + y * src->y_stride, src->width);
  }
  for (y = 0; y < (dst->height + 1) / 2; ++y) {
    memcpy(dst->u + y * dst->uv_stride,
@@ -71,9 +140,32 @@ int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) {
    memcpy(dst->v + y * dst->uv_stride,
           src->v + y * src->uv_stride, (src->width + 1) / 2);
  }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (dst->a != NULL)  {
+    for (y = 0; y < dst->height; ++y) {
+      memcpy(dst->a + y * dst->a_stride,
+             src->a + y * src->a_stride, src->width);
+    }
+  }
+  if (dst->u0 != NULL)  {
+    int uv0_width = src->width;
+    if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {
+      uv0_width = (uv0_width + 1) / 2;
+    }
+    for (y = 0; y < dst->height; ++y) {
+      memcpy(dst->u0 + y * dst->uv0_stride,
+             src->u0 + y * src->uv0_stride, uv0_width);
+      memcpy(dst->v0 + y * dst->uv0_stride,
+             src->v0 + y * src->uv0_stride, uv0_width);
+    }
+  }
+#endif
  return 1;
 }

+//------------------------------------------------------------------------------
+// Picture cropping
+
 int WebPPictureCrop(WebPPicture* const pic,
                    int left, int top, int width, int height) {
  WebPPicture tmp;
@@ -84,8 +176,7 @@ int WebPPictureCrop(WebPPicture* const pic,
  if (left < 0 || ((left + width + 1) & ~1) > pic->width) return 0;
  if (top < 0 || ((top + height + 1) & ~1) > pic->height) return 0;

-  tmp = *pic;
-  tmp.y = NULL;
+  WebPPictureGrabSpecs(pic, &tmp);
  tmp.width = width;
  tmp.height = height;
  if (!WebPPictureAlloc(&tmp)) return 0;
@@ -99,12 +190,189 @@ int WebPPictureCrop(WebPPicture* const pic,
    memcpy(tmp.u + y * tmp.uv_stride, pic->u + offset, (width + 1) / 2);
    memcpy(tmp.v + y * tmp.uv_stride, pic->v + offset, (width + 1) / 2);
  }
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (tmp.a) {      // copy the cropped window of the alpha plane
+    for (y = 0; y < height; ++y) {
+      memcpy(tmp.a + y * tmp.a_stride,
+             pic->a + (top + y) * pic->a_stride + left, width);
+    }
+  }
+  if (tmp.u0) {     // copy the cropped window of the full-height U0/V0 planes
+    int w = width;
+    int l = left;
+    if ((tmp.colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) {   // ignore alpha
+      w = (w + 1) / 2;
+      l = (l + 1) / 2;
+    }
+    for (y = 0; y < height; ++y) {
+      memcpy(tmp.u0 + y * tmp.uv0_stride,
+             pic->u0 + (top + y) * pic->uv0_stride + l, w);
+      memcpy(tmp.v0 + y * tmp.uv0_stride,
+             pic->v0 + (top + y) * pic->uv0_stride + l, w);
+    }
+  }
+#endif
+
  WebPPictureFree(pic);
  *pic = tmp;
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+#define RFIX 30
+#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
+static inline void ImportRow(const uint8_t* src, int src_width,
+                             int32_t* frow, int32_t* irow, int dst_width) {
+  const int x_expand = (src_width < dst_width);   // horizontal upscaling?
+  const int fx_scale = (1 << RFIX) / dst_width;   // fixed-point 1/dst_width
+  int x_in = 0;      // read index in 'src'
+  int x_out;         // write index in 'frow' / 'irow'
+  int x_accum = 0;
+  if (!x_expand) {   // not expanding: accumulate the source samples
+    int sum = 0;
+    for (x_out = 0; x_out < dst_width; ++x_out) {
+      x_accum += src_width - dst_width;
+      for (; x_accum > 0; x_accum -= dst_width) {
+        sum += src[x_in++];
+      }
+      {        // Emit next horizontal pixel.
+        const int32_t base = src[x_in++];
+        const int32_t frac = base * (-x_accum);
+        frow[x_out] = (sum + base) * dst_width - frac;
+        sum = MULT(frac, fx_scale);    // fresh fractional start for next pixel
+      }
+    }
+  } else {        // simple bilinear interpolation
+    int left = src[0], right = src[0];
+    for (x_out = 0; x_out < dst_width; ++x_out) {
+      if (x_accum < 0) {   // move on to the next pair of source samples
+        left = right;
+        right = src[++x_in];
+        x_accum += dst_width - 1;
+      }
+      frow[x_out] = right * (dst_width - 1) + (left - right) * x_accum;
+      x_accum -= src_width - 1;
+    }
+  }
+  // Accumulate the new row's contribution
+  for (x_out = 0; x_out < dst_width; ++x_out) {
+    irow[x_out] += frow[x_out];
+  }
+}
+
+static void ExportRow(int32_t* frow, int32_t* irow, uint8_t* dst, int dst_width,
+                      const int yscale, const int64_t fxy_scale) {
+  int x_out;
+  for (x_out = 0; x_out < dst_width; ++x_out) {
+    const int frac = MULT(frow[x_out], yscale);   // carried over to next row
+    const int v = (int)(MULT(irow[x_out] - frac, fxy_scale));
+    dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;   // clip to [0, 255]
+    irow[x_out] = frac;   // new fractional start
+  }
+}
+
+static void RescalePlane(const uint8_t* src,
+                         int src_width, int src_height, int src_stride,
+                         uint8_t* dst,
+                         int dst_width, int dst_height, int dst_stride,
+                         int32_t* const work) {
+  const int x_expand = (src_width < dst_width);    // horizontal upscaling?
+  const int fy_scale = (1 << RFIX) / dst_height;   // fixed-point 1/dst_height
+  const int64_t fxy_scale = x_expand ?
+      ((int64_t)dst_height << RFIX) / (dst_width * src_height) :
+      ((int64_t)dst_height << RFIX) / (src_width * src_height);
+  int y_accum = src_height;
+  int y;                             // source row index
+  int32_t* irow = work;              // integral contribution
+  int32_t* frow = work + dst_width;  // fractional contribution
+
+  memset(work, 0, 2 * dst_width * sizeof(*work));   // clears irow and frow
+  for (y = 0; y < src_height; ++y) {
+    // import new contribution of one source row.
+    ImportRow(src, src_width, frow, irow, dst_width);
+    src += src_stride;
+    // emit output row(s)
+    y_accum -= dst_height;
+    for (; y_accum <= 0; y_accum += src_height) {
+      const int yscale = fy_scale * (-y_accum);
+      ExportRow(frow, irow, dst, dst_width, yscale, fxy_scale);
+      dst += dst_stride;
+    }
+  }
+}
+#undef MULT
+#undef RFIX
+
+// Rescales 'pic' in place to width x height. A zero width or height is derived
+// from the other one, preserving the aspect ratio. Returns 0 upon error.
+int WebPPictureRescale(WebPPicture* const pic, int width, int height) {
+  WebPPicture tmp;
+  int prev_width, prev_height;
+  int32_t* work;
+
+  if (pic == NULL) return 0;
+  prev_width = pic->width;
+  prev_height = pic->height;
+  // An empty source can't be rescaled (and would cause divisions by zero).
+  if (prev_width <= 0 || prev_height <= 0) return 0;
+  // if width is unspecified, scale original proportionally to height ratio.
+  if (width == 0) {
+    width = (prev_width * height + prev_height / 2) / prev_height;
+  }
+  // if height is unspecified, scale original proportionally to width ratio.
+  if (height == 0) {
+    height = (prev_height * width + prev_width / 2) / prev_width;
+  }
+  // Check if the overall dimensions still make sense.
+  if (width <= 0 || height <= 0) return 0;
+
+  WebPPictureGrabSpecs(pic, &tmp);
+  tmp.width = width;
+  tmp.height = height;
+  if (!WebPPictureAlloc(&tmp)) return 0;
+
+  // Two rows of 'width' accumulators; the size is computed using size_t.
+  work = (int32_t*)malloc(2 * (size_t)width * sizeof(*work));
+  if (work == NULL) {
+    WebPPictureFree(&tmp);
+    return 0;
+  }
+
+  RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+               tmp.y, width, height, tmp.y_stride, work);
+  RescalePlane(pic->u, (prev_width + 1) / 2, (prev_height + 1) / 2,
+               pic->uv_stride, tmp.u, (width + 1) / 2, (height + 1) / 2,
+               tmp.uv_stride, work);
+  RescalePlane(pic->v, (prev_width + 1) / 2, (prev_height + 1) / 2,
+               pic->uv_stride, tmp.v, (width + 1) / 2, (height + 1) / 2,
+               tmp.uv_stride, work);
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (tmp.a) {
+    RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                 tmp.a, width, height, tmp.a_stride, work);
+  }
+  if (tmp.u0) {
+    const int s = ((tmp.colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) ? 2 : 1;
+    RescalePlane(
+        pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
+        tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work);
+    RescalePlane(
+        pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
+        tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work);
+  }
+#endif
+
+  WebPPictureFree(pic);
+  free(work);
+  *pic = tmp;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
 // Write-to-memory

 typedef struct {
@@ -150,7 +418,7 @@ static int WebPMemoryWrite(const uint8_t* data, size_t data_size,
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // RGB -> YUV conversion
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
 // More information at: http://en.wikipedia.org/wiki/YCbCr
@@ -196,36 +464,98 @@ static inline int rgb_to_v(int r, int g, int b) {
  picture->v[dst] = rgb_to_v(r, g, b);                   \
 }

+#define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
+  const int src = (step * (x_in) + (y) * rgb_stride);    \
+  const int dst = (x_out) + (y) * picture->uv0_stride;   \
+  const int r = SUM(r_ptr + src);                        \
+  const int g = SUM(g_ptr + src);                        \
+  const int b = SUM(b_ptr + src);                        \
+  picture->u0[dst] = rgb_to_u(r, g, b);                  \
+  picture->v0[dst] = rgb_to_v(r, g, b);                  \
+}
+
+static void MakeGray(WebPPicture* const picture) {   // U/V <- 128 (no chroma)
+  const int uv_w = (picture->width + 1) >> 1;
+  int j = (picture->height + 1) >> 1;
+  while (j-- > 0) {   // rows are independent, so they are filled bottom-up
+    memset(picture->u + j * picture->uv_stride, 128, uv_w);
+    memset(picture->v + j * picture->uv_stride, 128, uv_w);
+  }
+}
+
 static int Import(WebPPicture* const picture,
                  const uint8_t* const rgb, int rgb_stride,
-                  int step, int swap) {
+                  int step, int swap_rb, int import_alpha) {
+  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
  int x, y;
-  const uint8_t* const r_ptr = rgb + (swap ? 2 : 0);
+  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
  const uint8_t* const g_ptr = rgb + 1;
-  const uint8_t* const b_ptr = rgb + (swap ? 0 : 2);
+  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
+  const int width = picture->width;
+  const int height = picture->height;

-  for (y = 0; y < picture->height; ++y) {
-    for (x = 0; x < picture->width; ++x) {
+  // Import luma plane
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
      const int offset = step * x + y * rgb_stride;
      picture->y[x + y * picture->y_stride] =
        rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
    }
  }
-  for (y = 0; y < (picture->height >> 1); ++y) {
-    for (x = 0; x < (picture->width >> 1); ++x) {
-      RGB_TO_UV(x, y, SUM4);
+
+  // Downsample U/V plane
+  if (uv_csp != WEBP_YUV400) {
+    for (y = 0; y < (height >> 1); ++y) {
+      for (x = 0; x < (width >> 1); ++x) {
+        RGB_TO_UV(x, y, SUM4);
+      }
+      if (picture->width & 1) {
+        RGB_TO_UV(x, y, SUM2V);
+      }
    }
-    if (picture->width & 1) {
-      RGB_TO_UV(x, y, SUM2V);
+    if (height & 1) {
+      for (x = 0; x < (width >> 1); ++x) {
+        RGB_TO_UV(x, y, SUM2H);
+      }
+      if (width & 1) {
+        RGB_TO_UV(x, y, SUM1);
+      }
    }
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    // Store original U/V samples too
+    if (uv_csp == WEBP_YUV422) {
+      for (y = 0; y < height; ++y) {
+        for (x = 0; x < (width >> 1); ++x) {
+          RGB_TO_UV0(2 * x, x, y, SUM2H);
+        }
+        if (width & 1) {
+          RGB_TO_UV0(2 * x, x, y, SUM1);
+        }
+      }
+    } else if (uv_csp == WEBP_YUV444) {
+      for (y = 0; y < height; ++y) {
+        for (x = 0; x < width; ++x) {
+          RGB_TO_UV0(x, x, y, SUM1);
+        }
+      }
+    }
+#endif
+  } else {
+    MakeGray(picture);
  }
-  if (picture->height & 1) {
-    for (x = 0; x < (picture->width >> 1); ++x) {
-      RGB_TO_UV(x, y, SUM2H);
-    }
-    if (picture->width & 1) {
-      RGB_TO_UV(x, y, SUM1);
+
+  if (import_alpha) {
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    const uint8_t* const a_ptr = rgb + 3;
+    assert(step >= 4);
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        picture->a[x + y * picture->a_stride] =
+          a_ptr[step * x + y * rgb_stride];
+      }
    }
+#endif
  }
  return 1;
 }
@@ -237,34 +567,38 @@ static int Import(WebPPicture* const picture,

 int WebPPictureImportRGB(WebPPicture* const picture,
                         const uint8_t* const rgb, int rgb_stride) {
+  picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
  if (!WebPPictureAlloc(picture)) return 0;
-  return Import(picture, rgb, rgb_stride, 3, 0);
+  return Import(picture, rgb, rgb_stride, 3, 0, 0);
 }

 int WebPPictureImportBGR(WebPPicture* const picture,
                         const uint8_t* const rgb, int rgb_stride) {
+  picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
  if (!WebPPictureAlloc(picture)) return 0;
-  return Import(picture, rgb, rgb_stride, 3, 1);
+  return Import(picture, rgb, rgb_stride, 3, 1, 0);
 }

 int WebPPictureImportRGBA(WebPPicture* const picture,
                          const uint8_t* const rgba, int rgba_stride) {
+  picture->colorspace |= WEBP_CSP_ALPHA_BIT;
  if (!WebPPictureAlloc(picture)) return 0;
-  return Import(picture, rgba, rgba_stride, 4, 0);
+  return Import(picture, rgba, rgba_stride, 4, 0, 1);
 }

 int WebPPictureImportBGRA(WebPPicture* const picture,
                          const uint8_t* const rgba, int rgba_stride) {
+  picture->colorspace |= WEBP_CSP_ALPHA_BIT;
  if (!WebPPictureAlloc(picture)) return 0;
-  return Import(picture, rgba, rgba_stride, 4, 1);
+  return Import(picture, rgba, rgba_stride, 4, 1, 1);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Simplest call:

 typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int);

-static size_t Encode(const uint8_t* rgb, int width, int height, int stride,
+static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
                     Importer import, float quality_factor, uint8_t** output) {
  size_t output_size = 0;
  WebPPicture pic;
@@ -286,7 +620,7 @@ static size_t Encode(const uint8_t* rgb, int width, int height, int stride,
  wrt.size = &output_size;
  InitMemoryWriter(&wrt);

-  ok = import(&pic, rgb, stride) && WebPEncode(&config, &pic);
+  ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
  WebPPictureFree(&pic);
  if (!ok) {
    free(*output);
@@ -309,7 +643,7 @@ ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);

 #undef ENCODE_FUNC

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/quant.c
+++ b/src/enc/quant.c
@@ -33,13 +33,13 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static inline int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
 }

-const uint8_t VP8Zigzag[16] = {
+static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
 };

@@ -132,7 +132,7 @@ static const uint8_t kFreqSharpening[16] = {
  90, 90, 90, 90
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Initialize quantization parameters in VP8Matrix

 // Returns the average quantizer
@@ -143,7 +143,7 @@ static int ExpandMatrix(VP8Matrix* const m, int type) {
    m->q_[i] = m->q_[1];
  }
  for (i = 0; i < 16; ++i) {
-    const int j = VP8Zigzag[i];
+    const int j = kZigzag[i];
    const int bias = kBiasMatrices[type][j];
    m->iq_[j] = (1 << QFIX) / m->q_[j];
    m->bias_[j] = BIAS(bias);
@@ -192,7 +192,7 @@ static void SetupMatrices(VP8Encoder* enc) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Initialize filtering parameters

 // Very small filter-strength values have close to no visual effect. So we can
@@ -214,7 +214,7 @@ static void SetupFilterStrength(VP8Encoder* const enc) {
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 // Note: if you change the values below, remember that the max range
 // allowed by the syntax for DQ_UV is [-16,16].
@@ -286,7 +286,7 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Form the predictions in cache

 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
@@ -316,7 +316,7 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Quantize

 // Layout:
@@ -341,7 +341,7 @@ const int VP8Scan[16 + 4 + 4] = {
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Distortion measurement

 static const uint16_t kWeightY[16] = {
@@ -384,7 +384,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->score += src->score;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Performs trellis-optimized quantization.

 // Trellis
@@ -440,7 +440,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
-      const int j  = VP8Zigzag[n];
+      const int j  = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
@@ -464,7 +464,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,

  // traverse trellis.
  for (n = first; n <= last; ++n) {
-    const int j  = VP8Zigzag[n];
+    const int j  = kZigzag[n];
    const int Q  = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);     // neutral bias
@@ -560,7 +560,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,

  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
-    const int j = VP8Zigzag[n];
+    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];
@@ -571,7 +571,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,

 #undef NODE

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Performs: difference, transform, quantize, back-transform, add
 // all at once. Output is the reconstructed block in *yuv_out, and the
 // quantized levels in *levels.
@@ -615,8 +615,8 @@ static int ReconstructIntra16(VP8EncIterator* const it,

  // Transform back
  VP8ITransformWHT(dc_tmp, tmp[0]);
-  for (n = 0; n < 16; ++n) {
-    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n]);
+  for (n = 0; n < 16; n += 2) {
+    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
@@ -642,7 +642,7 @@ static int ReconstructIntra4(VP8EncIterator* const it,
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
  }
-  VP8ITransform(ref, tmp, yuv_out);
+  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
 }

@@ -666,8 +666,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
-            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, &dqm->uv_,
-                                 dqm->lambda_trellis_uv_);
+            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
+                                 &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
@@ -679,13 +679,13 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
    }
  }

-  for (n = 0; n < 8; ++n) {
-    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n]);
+  for (n = 0; n < 8; n += 2) {
+    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
  }
  return (nz << 16);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
 // Pick the mode is lower RD-cost = Rate + lamba * Distortion.

@@ -738,7 +738,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  VP8SetIntra16Mode(it, rd->mode_i16);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 // return the cost array corresponding to the surrounding prediction modes.
 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
@@ -757,10 +757,15 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
+  int total_header_bits = 0;
  VP8ModeScore rd_best;

+  if (enc->max_i4_header_bits_ == 0) {
+    return 0;
+  }
+
  InitScore(&rd_best);
-  rd_best.score = 0;
+  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
    VP8ModeScore rd_i4;
@@ -799,7 +804,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
-    if (rd_best.score >= rd->score) {
+    total_header_bits += mode_costs[best_mode];
+    if (rd_best.score >= rd->score ||
+        total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
@@ -817,7 +824,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  return 1;   // select intra4x4 over intra16x16
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  VP8Encoder* const enc = it->enc_;
@@ -855,7 +862,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  AddScore(rd, &rd_best);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Final reconstruction and quantization.

 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
@@ -882,7 +889,7 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  rd->nz = nz;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Entry point

 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) {
--- a/src/enc/syntax.c
+++ b/src/enc/syntax.c
@@ -26,7 +26,7 @@ extern "C" {
 #define MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
 #define MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Writers for header's various pieces (in order of appearance)

 // Main keyframe header
@@ -39,26 +39,31 @@ static void PutLE32(uint8_t* const data, uint32_t val) {
 }

 static int PutHeader(int profile, size_t size0, size_t total_size,
-                     const WebPPicture* const pic) {
+                     WebPPicture* const pic) {
  uint8_t buf[KHEADER_SIZE];
  uint8_t RIFF[KRIFF_SIZE] = {
    'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', 'V', 'P', '8', ' '
  };
  uint32_t bits;

-  if (size0 >= MAX_PARTITION0_SIZE) {
-    return 0;   // partition #0 is too big to fit
+  if (size0 >= MAX_PARTITION0_SIZE) {  // partition #0 is too big to fit
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION0_OVERFLOW);
  }

-  PutLE32(RIFF + 4, total_size + KSIZE_OFFSET);
-  PutLE32(RIFF + 16, total_size);
-  if (!pic->writer(RIFF, sizeof(RIFF), pic))
-    return 0;
+  if (total_size > 0xfffffffeU - KRIFF_SIZE) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG);
+  }

-  bits = 0               // keyframe (1b)
-       | (profile << 1)  // profile (3b)
-       | (1 << 4)        // visible (1b)
-       | (size0 << 5);   // partition length (19b)
+  PutLE32(RIFF + 4, (uint32_t)(total_size + KSIZE_OFFSET));
+  PutLE32(RIFF + 16, (uint32_t)total_size);
+  if (!pic->writer(RIFF, sizeof(RIFF), pic)) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE);
+  }
+
+  bits = 0                         // keyframe (1b)
+       | (profile << 1)            // profile (3b)
+       | (1 << 4)                  // visible (1b)
+       | ((uint32_t)size0 << 5);   // partition length (19b)
  buf[0] = bits & 0xff;
  buf[1] = (bits >> 8) & 0xff;
  buf[2] = (bits >> 16) & 0xff;
@@ -138,13 +143,13 @@ static void PutQuant(VP8BitWriter* const bw,

 // Partition sizes
 static int EmitPartitionsSize(const VP8Encoder* const enc,
-                              const WebPPicture* const pic) {
+                              WebPPicture* const pic) {
  uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)];
  int p;
  for (p = 0; p < enc->num_parts_ - 1; ++p) {
    const size_t part_size = VP8BitWriterSize(enc->parts_ + p);
    if (part_size >= MAX_PARTITION_SIZE) {
-      return 0;     // partition is too big to fit
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW);
    }
    buf[3 * p + 0] = (part_size >>  0) & 0xff;
    buf[3 * p + 1] = (part_size >>  8) & 0xff;
@@ -153,16 +158,69 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
  return p ? pic->writer(buf, 3 * p, pic) : 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+
+#define KTRAILER_SIZE 8
+
+static void PutLE24(uint8_t* buf, size_t value) {
+  buf[0] = (value >>  0) & 0xff;
+  buf[1] = (value >>  8) & 0xff;
+  buf[2] = (value >> 16) & 0xff;
+}
+
+static int WriteExtensions(VP8Encoder* const enc) {
+  uint8_t buffer[KTRAILER_SIZE];
+  VP8BitWriter* const bw = &enc->bw_;
+  WebPPicture* const pic = enc->pic_;
+
+  // Layer (bytes 0..3)
+  PutLE24(buffer + 0, enc->layer_data_size_);
+  buffer[3] = enc->pic_->colorspace & WEBP_CSP_UV_MASK;
+  if (enc->layer_data_size_ > 0) {
+    assert(enc->use_layer_);
+    // append layer data to last partition
+    if (!VP8BitWriterAppend(&enc->parts_[enc->num_parts_ - 1],
+                            enc->layer_data_, enc->layer_data_size_)) {
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
+    }
+  }
+  // Alpha (bytes 4..6)
+  PutLE24(buffer + 4, enc->alpha_data_size_);
+  if (enc->alpha_data_size_ > 0) {
+    assert(enc->has_alpha_);
+    if (!VP8BitWriterAppend(bw, enc->alpha_data_, enc->alpha_data_size_)) {
+      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
+    }
+  }
+
+  buffer[KTRAILER_SIZE - 1] = 0x01;  // marker
+  if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
+  }
+  return 1;
+}
+
+#endif    /* WEBP_EXPERIMENTAL_FEATURES */
+
+//------------------------------------------------------------------------------

 static size_t GeneratePartition0(VP8Encoder* const enc) {
  VP8BitWriter* const bw = &enc->bw_;
  const int mb_size = enc->mb_w_ * enc->mb_h_;
  uint64_t pos1, pos2, pos3;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  const int need_extensions = enc->has_alpha_ || enc->use_layer_;
+#endif

  pos1 = VP8BitWriterPos(bw);
  VP8BitWriterInit(bw, mb_size * 7 / 8);        // ~7 bits per macroblock
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  VP8PutBitUniform(bw, need_extensions);   // extensions
+#else
  VP8PutBitUniform(bw, 0);   // colorspace
+#endif
  VP8PutBitUniform(bw, 0);   // clamp type

  PutSegmentHeader(bw, enc);
@@ -174,11 +232,20 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
  pos2 = VP8BitWriterPos(bw);
  VP8CodeIntraModes(enc);
  VP8BitWriterFinish(bw);
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (need_extensions && !WriteExtensions(enc)) {
+    return 0;
+  }
+#endif
+
  pos3 = VP8BitWriterPos(bw);

  if (enc->pic_->stats) {
    enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
    enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
+    enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
+    enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_;
  }
  return !bw->error_;
 }
@@ -191,7 +258,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
  int p;

  // Partition #0 with header and partition sizes
-  ok = GeneratePartition0(enc);
+  ok = !!GeneratePartition0(enc);

  // Compute total size (for the RIFF header)
  coded_size = KHEADER_SIZE + VP8BitWriterSize(bw) + 3 * (enc->num_parts_ - 1);
@@ -226,11 +293,11 @@ int VP8EncWrite(VP8Encoder* const enc) {
    ok = pic->writer(pad_byte, 1, pic);
  }

-  enc->coded_size_ = coded_size + KRIFF_SIZE;
+  enc->coded_size_ = (int)coded_size + KRIFF_SIZE;
  return ok;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/enc/tree.c
+++ b/src/enc/tree.c
@@ -15,7 +15,7 @@
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Default probabilities

 // Paragraph 13.5
@@ -343,7 +343,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
  } while (VP8IteratorNext(&it, 0));
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Paragraph 13

 const uint8_t
--- a/src/enc/vp8enci.h
+++ b/src/enc/vp8enci.h
@@ -13,20 +13,24 @@
 #define WEBP_ENC_VP8ENCI_H_

 #include "string.h"     // for memcpy()
-#include "webp/encode.h"
-#include "bit_writer.h"
+#include "../webp/encode.h"
+#include "../dsp/dsp.h"
+#include "../utils/bit_writer.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Various defines and enums

 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 1
-#define ENC_REV_VERSION 2
+#define ENC_REV_VERSION 3
+
+// size of histogram used by CollectHistogram.
+#define MAX_COEFF_THRESH   64

 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -158,7 +162,7 @@ static inline int QUANTDIV(int n, int iQ, int B) {
 }
 extern const uint8_t VP8Zigzag[16];

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Headers

 typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
@@ -184,7 +188,7 @@ typedef struct {
  StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 7.4k
  CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 11.4k
  int use_skip_proba_;      // Note: we always use skip_proba for now.
-  int nb_skip_, nb_i4_, nb_i16_;   // block type counters
+  int nb_skip_;             // number of skipped blocks
 } VP8Proba;

 // Filter parameters. Not actually used in the code (we don't perform
@@ -196,19 +200,19 @@ typedef struct {
  int i4x4_lf_delta_;      // delta filter level for i4x4 relative to i16x16
 } VP8FilterHeader;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Informations about the macroblocks.

 typedef struct {
  // block type
-  uint8_t type_:2;     // 0=i4x4, 1=i16x16
-  uint8_t uv_mode_:2;
-  uint8_t skip_:1;
-  uint8_t segment_:2;
+  unsigned int type_:2;     // 0=i4x4, 1=i16x16
+  unsigned int uv_mode_:2;
+  unsigned int skip_:1;
+  unsigned int segment_:2;
  uint8_t alpha_;      // quantization-susceptibility
 } VP8MBInfo;

-typedef struct {
+typedef struct VP8Matrix {
  uint16_t q_[16];        // quantizer steps
  uint16_t iq_[16];       // reciprocals, fixed point.
  uint16_t bias_[16];     // rounding bias
@@ -258,7 +262,7 @@ typedef struct {
  uint8_t*      preds_;            // intra mode predictors (4x4 blocks)
  uint32_t*     nz_;               // non-zero pattern
  uint8_t       i4_boundary_[37];  // 32+5 boundary samples needed by intra4x4
-  uint8_t*      i4_top_;           // pointer to the current *top boundary sample
+  uint8_t*      i4_top_;           // pointer to the current top boundary sample
  int           i4_;               // current intra4x4 mode being tested
  int           top_nz_[9];        // top-non-zero context.
  int           left_nz_[9];       // left-non-zero. left_nz[8] is independent.
@@ -302,7 +306,7 @@ void VP8SetSkip(const VP8EncIterator* const it, int skip);
 void VP8SetSegment(const VP8EncIterator* const it, int segment);
 void VP8IteratorResetCosts(VP8EncIterator* const it);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Encoder

 struct VP8Encoder {
@@ -326,6 +330,17 @@ struct VP8Encoder {
  VP8BitWriter bw_;                         // part0
  VP8BitWriter parts_[MAX_NUM_PARTITIONS];  // token partitions

+  // transparency blob
+  int has_alpha_;
+  uint8_t* alpha_data_;       // non-NULL if transparency is present
+  size_t alpha_data_size_;
+
+  // enhancement layer
+  int use_layer_;
+  VP8BitWriter layer_bw_;
+  uint8_t* layer_data_;
+  size_t layer_data_size_;
+
  // quantization info (one set of DC/AC dequant factor per segment)
  VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
  int base_quant_;                 // nominal quantizer value. Only used
@@ -345,8 +360,9 @@ struct VP8Encoder {
  int      block_count_[3];

  // quality/speed settings
-  int method_;             // 0=fastest, 6=best/slowest.
-  int rd_opt_level_;       // Deduced from method_.
+  int method_;              // 0=fastest, 6=best/slowest.
+  int rd_opt_level_;        // Deduced from method_.
+  int max_i4_header_bits_;  // partition #0 safeness factor

  // Memory
  VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
@@ -366,7 +382,7 @@ struct VP8Encoder {
  LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // internal functions. Not public.

  // in tree.c
@@ -403,6 +419,10 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd);
 int VP8EncLoop(VP8Encoder* const enc);
 int VP8StatLoop(VP8Encoder* const enc);

+  // in webpenc.c
+// Assign an error code to a picture. Return false for convenience.
+int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error);
+
  // in analysis.c
 // Main analysis loop. Decides the segmentations and complexity.
 // Assigns a first guess for Intra16 and uvmode_ prediction modes.
@@ -414,58 +434,27 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality);
 // Pick best modes and fills the levels. Returns true if skipped.
 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt);

-  // in dsp.c
-// Transforms
-typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst);
-typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
-typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
-extern VP8Idct VP8ITransform;
-extern VP8Fdct VP8FTransform;
-extern VP8WHT VP8ITransformWHT;
-extern VP8WHT VP8FTransformWHT;
-// Predictions
-// *dst is the destination block. *top, *top_right and *left can be NULL.
-typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
-                              const uint8_t* top);
-typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
-extern VP8Intra4Preds VP8EncPredLuma4;
-extern VP8IntraPreds VP8EncPredLuma16;
-extern VP8IntraPreds VP8EncPredChroma8;
+  // in alpha.c
+void VP8EncInitAlpha(VP8Encoder* enc);           // initialize alpha compression
+void VP8EncCodeAlphaBlock(VP8EncIterator* it);   // analyze or code a macroblock
+int VP8EncFinishAlpha(VP8Encoder* enc);          // finalize compressed data
+void VP8EncDeleteAlpha(VP8Encoder* enc);         // delete compressed data

-typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
-extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
-typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
-                          const uint16_t* const weights);
-extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
-
-typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
-extern VP8BlockCopy VP8Copy4x4;
-extern VP8BlockCopy VP8Copy8x8;
-extern VP8BlockCopy VP8Copy16x16;
-// Quantization
-typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
-                                int n, const VP8Matrix* const mtx);
-extern VP8QuantizeBlock VP8EncQuantizeBlock;
-
-typedef enum {
-  kSSE2,
-  kSSE3
-} CPUFeature;
-// returns true if the CPU supports the feature.
-typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo CPUInfo;
-
-void VP8EncDspInit(void);   // must be called before using any of the above
+  // in layer.c
+void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
+void VP8EncCodeLayerBlock(VP8EncIterator* it);   // code one more macroblock
+int VP8EncFinishLayer(VP8Encoder* const enc);    // finalize coding
+void VP8EncDeleteLayer(VP8Encoder* enc);         // reclaim memory

  // in filter.c
 extern void VP8InitFilter(VP8EncIterator* const it);
 extern void VP8StoreFilterStats(VP8EncIterator* const it);
 extern void VP8AdjustFilterStrength(VP8EncIterator* const it);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif  // WEBP_ENC_VP8ENCI_H_
+#endif  /* WEBP_ENC_VP8ENCI_H_ */
--- a/src/enc/webpenc.c
+++ b/src/enc/webpenc.c
@@ -9,6 +9,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
@@ -25,17 +26,15 @@ extern "C" {
 #include <stdio.h>
 #endif

-#define MAX_DIMENSION 16384   // maximum width/height allowed by the spec
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int WebPGetEncoderVersion(void) {
  return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPPicture
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static int DummyWriter(const uint8_t* data, size_t data_size,
                       const WebPPicture* const picture) {
@@ -53,13 +52,14 @@ int WebPPictureInitInternal(WebPPicture* const picture, int version) {
  if (picture) {
    memset(picture, 0, sizeof(*picture));
    picture->writer = DummyWriter;
+    WebPEncodingSetError(picture, VP8_ENC_OK);
  }
  return 1;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8Encoder
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static void ResetSegmentHeader(VP8Encoder* const enc) {
  VP8SegmentHeader* const hdr = &enc->segment_hdr_;
@@ -110,11 +110,15 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {

 static void MapConfigToTools(VP8Encoder* const enc) {
  const int method = enc->config_->method;
+  const int limit = 100 - enc->config_->partition_limit;
  enc->method_ = method;
  enc->rd_opt_level_ = (method >= 6) ? 3
                     : (method >= 5) ? 2
                     : (method >= 3) ? 1
                     : 0;
+  enc->max_i4_header_bits_ =
+      256 * 16 * 16 *                 // upper bound: up to 16bit per 4x4 block
+      (limit * limit) / (100 * 100);  // ... modulated with a quadratic curve.
 }

 // Memory scaling with dimensions:
@@ -155,7 +159,8 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
                               2 * ALIGN_CST)           // align all
                               * sizeof(uint8_t);
-  const size_t lf_stats_size = config->autofilter ? sizeof(LFStats) : 0;
+  const size_t lf_stats_size =
+      config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
  VP8Encoder* enc;
  uint8_t* mem;
  size_t size = sizeof(VP8Encoder) + ALIGN_CST  // main struct
@@ -193,7 +198,10 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
  printf("===================================\n");
 #endif
  mem = (uint8_t*)malloc(size);
-  if (mem == NULL) return NULL;
+  if (mem == NULL) {
+    WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
+    return NULL;
+  }
  enc = (VP8Encoder*)mem;
  mem = (uint8_t*)DO_ALIGN(mem + sizeof(*enc));
  memset(enc, 0, sizeof(*enc));
@@ -215,7 +223,7 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
  mem += preds_w * preds_h * sizeof(uint8_t);
  enc->nz_ = 1 + (uint32_t*)mem;
  mem += nz_size;
-  enc->lf_stats_ = lf_stats_size ? (LFStats*)mem : NULL;
+  enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
  mem += lf_stats_size;

  // top samples (all 16-aligned)
@@ -242,14 +250,25 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config,
  ResetFilterHeader(enc);
  ResetBoundaryPredictions(enc);

+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  VP8EncInitAlpha(enc);
+  VP8EncInitLayer(enc);
+#endif
+
  return enc;
 }

 static void DeleteEncoder(VP8Encoder* enc) {
-  free(enc);
+  if (enc) {
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    VP8EncDeleteAlpha(enc);
+    VP8EncDeleteLayer(enc);
+#endif
+    free(enc);
+  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 static double GetPSNR(uint64_t err, uint64_t size) {
  return err ? 10. * log10(255. * 255. * size / err) : 99.;
@@ -284,31 +303,46 @@ static void StoreStats(VP8Encoder* const enc) {
  }
 }

-//-----------------------------------------------------------------------------
+int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error) {
+  assert((int)error <= VP8_ENC_ERROR_BAD_WRITE);   // debug-only range check
+  assert((int)error >= VP8_ENC_OK);
+  pic->error_code = error;   // 'pic' must be non-NULL (not checked here)
+  return 0;   // always 0, so callers can 'return WebPEncodingSetError(...)'
+}
+
+//------------------------------------------------------------------------------

 int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) {
  VP8Encoder* enc;
  int ok;

-  if (config == NULL || pic == NULL)
-    return 0;   // bad params
+  if (pic == NULL)
+    return 0;   // no picture: nowhere to record an error code
+  WebPEncodingSetError(pic, VP8_ENC_OK);  // all ok so far
+  if (config == NULL)  // bad params
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
  if (!WebPValidateConfig(config))
-    return 0;   // invalid config.
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
  if (pic->width <= 0 || pic->height <= 0)
-    return 0;   // invalid parameters
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
  if (pic->y == NULL || pic->u == NULL || pic->v == NULL)
-    return 0;   // invalid parameters
-  if (pic->width >= MAX_DIMENSION || pic->height >= MAX_DIMENSION)
-    return 0;   // image is too big
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
+  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
+    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);

  enc = InitEncoder(config, pic);
-  if (enc == NULL) return 0;
+  if (enc == NULL) return 0;  // pic->error_code is already set.
  ok = VP8EncAnalyze(enc)
    && VP8StatLoop(enc)
    && VP8EncLoop(enc)
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    && VP8EncFinishAlpha(enc)
+    && VP8EncFinishLayer(enc)
+#endif
    && VP8EncWrite(enc);
  StoreStats(enc);
  DeleteEncoder(enc);
+
  return ok;
 }

--- a/src/utils/Makefile.am
+++ b/src/utils/Makefile.am
@@ -0,0 +1,13 @@
+AM_CPPFLAGS = -I$(top_srcdir)/src
+
+libwebputils_la_SOURCES = bit_reader.h bit_reader.c \
+                          bit_writer.h bit_writer.c \
+                          thread.h thread.c
+libwebputils_la_LDFLAGS = -version-info 0:0:0
+libwebputils_la_CPPFLAGS = $(USE_EXPERIMENTAL_CODE)
+libwebputilsinclude_HEADERS = ../webp/types.h
+libwebputilsincludedir = $(includedir)/webp
+
+noinst_HEADERS = bit_reader.h bit_writer.h thread.h
+
+noinst_LTLIBRARIES = libwebputils.la
--- a/src/utils/bit_reader.c
+++ b/src/utils/bit_reader.c
@@ -9,13 +9,13 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#include "bits.h"
+#include "./bit_reader.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8BitReader

 void VP8InitBitReader(VP8BitReader* const br,
@@ -56,7 +56,7 @@ const uint8_t kVP8NewRange[128] = {
  241, 243, 245, 247, 249, 251, 253, 127
 };

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Higher-level calls

 uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
@@ -72,7 +72,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
  return VP8Get(br) ? -value : value;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/utils/bit_reader.h
+++ b/src/utils/bit_reader.h
@@ -9,20 +9,21 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_DEC_BITS_H_
-#define WEBP_DEC_BITS_H_
+#ifndef WEBP_UTILS_BIT_READER_H_
+#define WEBP_UTILS_BIT_READER_H_

 #include <assert.h>
-#include "webp/decode_vp8.h"
+#include "../webp/decode_vp8.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Bitreader and code-tree reader

-typedef struct {
+typedef struct VP8BitReader VP8BitReader;
+struct VP8BitReader {
  const uint8_t* buf_;        // next byte to be read
  const uint8_t* buf_end_;    // end of read buffer
  int eof_;                   // true if input is exhausted
@@ -31,7 +32,7 @@ typedef struct {
  uint32_t range_;            // current range minus 1. In [127, 254] interval.
  uint32_t value_;            // current value
  int missing_;               // number of missing bits in value_ (8bit)
-} VP8BitReader;
+};

 // Initialize the bit reader and the boolean decoder.
 void VP8InitBitReader(VP8BitReader* const br,
@@ -61,15 +62,16 @@ static inline uint32_t VP8GetByte(VP8BitReader* const br) {

 static inline uint32_t VP8BitUpdate(VP8BitReader* const br, uint32_t split) {
  uint32_t bit;
+  const uint32_t value_split = (split + 1) << 8;
  // Make sure we have a least 8 bits in 'value_'
  if (br->missing_ > 0) {
    br->value_ |= VP8GetByte(br) << br->missing_;
    br->missing_ -= 8;
  }
-  bit = ((br->value_ >> 8) > split);
+  bit = (br->value_ >= value_split);
  if (bit) {
    br->range_ -= split + 1;
-    br->value_ -= (split + 1) << 8;
+    br->value_ -= value_split;
  } else {
    br->range_ = split;
  }
@@ -104,4 +106,4 @@ static inline int VP8GetSigned(VP8BitReader* const br, int v) {
 }    // extern "C"
 #endif

-#endif  // WEBP_DEC_BITS_H_
+#endif  /* WEBP_UTILS_BIT_READER_H_ */
--- a/src/utils/bit_writer.c
+++ b/src/utils/bit_writer.c
@@ -10,14 +10,15 @@
 // Author: Skal (pascal.massimino@gmail.com)

 #include <assert.h>
+#include <string.h>   // for memcpy()
 #include <stdlib.h>
-#include "vp8enci.h"
+#include "./bit_writer.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // VP8BitWriter

 static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
@@ -68,7 +69,7 @@ static void kFlush(VP8BitWriter* const bw) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // renormalization

 static const uint8_t kNorm[128] = {  // renorm_sizes[i] = 8 - log2(i)
@@ -84,7 +85,7 @@ static const uint8_t kNorm[128] = {  // renorm_sizes[i] = 8 - log2(i)
 };

 // range = ((range + 1) << kVP8Log2Range[range]) - 1
-const uint8_t kNewRange[128] = {
+static const uint8_t kNewRange[128] = {
  127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239,
  127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239,
  247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179,
@@ -147,7 +148,7 @@ void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) {
  }
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) {
  bw->range_   = 255 - 1;
@@ -168,7 +169,17 @@ uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) {
  return bw->buf_;
 }

-//-----------------------------------------------------------------------------
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+                       const uint8_t* data, size_t size) {
+  assert(data);   // 'data' must be non-NULL (debug-only check)
+  if (bw->nb_bits_ != -8) return 0;   // kFlush() must have been called
+  if (!BitWriterResize(bw, size)) return 0;   // could not make room
+  memcpy(bw->buf_ + bw->pos_, data, size);   // raw copy at the write position
+  bw->pos_ += size;
+  return 1;   // success; the failure paths above return 0
+}
+
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/utils/bit_writer.h
+++ b/src/utils/bit_writer.h
@@ -9,16 +9,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)

-#ifndef WEBP_ENC_BIT_WRITER_H_
-#define WEBP_ENC_BIT_WRITER_H_
+#ifndef WEBP_UTILS_BIT_WRITER_H_
+#define WEBP_UTILS_BIT_WRITER_H_

-#include "vp8enci.h"
+#include "../webp/types.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Bit-writing

 typedef struct VP8BitWriter VP8BitWriter;
@@ -39,6 +39,8 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob);
 int VP8PutBitUniform(VP8BitWriter* const bw, int bit);
 void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits);
 void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits);
+int VP8BitWriterAppend(VP8BitWriter* const bw,
+                       const uint8_t* data, size_t size);

 // return approximate write position (in bits)
 static inline uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
@@ -52,10 +54,10 @@ static inline size_t VP8BitWriterSize(const VP8BitWriter* const bw) {
  return bw->pos_;
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif

-#endif  // WEBP_ENC_BIT_WRITER_H_
+#endif  /* WEBP_UTILS_BIT_WRITER_H_ */
--- a/src/utils/thread.c
+++ b/src/utils/thread.c
@@ -0,0 +1,243 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Author: skal@google.com (Pascal Massimino)
+
+#include <assert.h>
+#include <string.h>   // for memset()
+#include "./thread.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef WEBP_USE_THREAD
+
+#if defined(_WIN32)
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#include <process.h>
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static int pthread_create(pthread_t* const thread, const void* attr,
+                          unsigned int (__stdcall *start)(void*), void* arg) {
+  (void)attr;   // attributes are ignored by this emulation
+  *thread = (pthread_t)_beginthreadex(NULL,   /* void *security */
+                                      0,      /* unsigned stack_size */
+                                      start,
+                                      arg,
+                                      0,      /* unsigned initflag */
+                                      NULL);  /* unsigned *thrdaddr */
+  if (*thread == NULL) return 1;   // non-zero: the thread was not created
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);   // unchecked
+  return 0;
+}
+
+static int pthread_join(pthread_t thread, void** value_ptr) {
+  (void)value_ptr;   // the thread's return value is not retrieved
+  return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);   // 0 only if wait and close both succeed
+}
+
+// Mutex
+static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) {
+  (void)mutexattr;   // attributes are ignored by this emulation
+  InitializeCriticalSection(mutex);
+  return 0;   // always reports success
+}
+
+static int pthread_mutex_lock(pthread_mutex_t* const mutex) {
+  EnterCriticalSection(mutex);   // the mutex is a CRITICAL_SECTION here
+  return 0;   // always reports success
+}
+
+static int pthread_mutex_unlock(pthread_mutex_t* const mutex) {
+  LeaveCriticalSection(mutex);   // counterpart of pthread_mutex_lock()
+  return 0;   // always reports success
+}
+
+static int pthread_mutex_destroy(pthread_mutex_t* const mutex) {
+  DeleteCriticalSection(mutex);   // counterpart of pthread_mutex_init()
+  return 0;   // always reports success
+}
+
+// Condition
+static int pthread_cond_destroy(pthread_cond_t* const condition) {
+  int ok = 1;   // cleared as soon as one handle fails to close
+  ok &= (CloseHandle(condition->waiting_sem_) != 0);   // all three closes are
+  ok &= (CloseHandle(condition->received_sem_) != 0);  // attempted, even after
+  ok &= (CloseHandle(condition->signal_event_) != 0);  // an earlier failure
+  return !ok;   // 0 on success, 1 otherwise
+}
+
+static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) {
+  (void)cond_attr;   // attributes are ignored by this emulation
+  condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+  condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+  if (condition->waiting_sem_ == NULL ||
+      condition->received_sem_ == NULL ||
+      condition->signal_event_ == NULL) {
+    pthread_cond_destroy(condition);   // also runs on the handles left NULL
+    return 1;   // non-zero: at least one object could not be created
+  }
+  return 0;
+}
+
+static int pthread_cond_signal(pthread_cond_t* const condition) {
+  int ok = 1;   // stays 1 (success) when no thread is waiting: nothing to do
+  if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+    // a thread is waiting in pthread_cond_wait: allow it to be notified
+    ok = (SetEvent(condition->signal_event_) != 0);
+    // wait until the event is consumed so the signaler cannot consume
+    // the event via its own pthread_cond_wait. WAIT_OBJECT_0 means success.
+    ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) ==
+           WAIT_OBJECT_0);
+  }
+  return !ok;   // pthread convention: 0 on success, non-zero on failure
+}
+
+static int pthread_cond_wait(pthread_cond_t* const condition,
+                             pthread_mutex_t* const mutex) {
+  int ok;   // kept strictly 0/1: Win32 BOOL results are normalized below
+  // note that there is a consumer available so the signal isn't dropped in
+  // pthread_cond_signal
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
+    return 1;   // could not register as a waiter; 'mutex' was never released
+  // now unlock the mutex so pthread_cond_signal may be issued
+  pthread_mutex_unlock(mutex);
+  ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+        WAIT_OBJECT_0);
+  ok &= (ReleaseSemaphore(condition->received_sem_, 1, NULL) != 0);  // ack
+  pthread_mutex_lock(mutex);
+  return !ok;   // 0 on success; 'mutex' is held again on return
+}
+
+#else  // _WIN32
+# define THREADFN void*
+# define THREAD_RETURN(val) val
+#endif
+
+//------------------------------------------------------------------------------
+
+static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
+  WebPWorker* const worker = (WebPWorker*)ptr;   // 'ptr' is the WebPWorker
+  int done = 0;
+  while (!done) {
+    pthread_mutex_lock(&worker->mutex_);
+    while (worker->status_ == OK) {   // wait in idling mode
+      pthread_cond_wait(&worker->condition_, &worker->mutex_);
+    }
+    if (worker->status_ == WORK) {   // a task was launched
+      if (worker->hook) {   // a NULL hook is simply skipped
+        worker->had_error |= !worker->hook(worker->data1, worker->data2);
+      }
+      worker->status_ = OK;   // back to idle; the hook ran with mutex_ held
+    } else if (worker->status_ == NOT_OK) {   // finish the worker
+      done = 1;   // leave the outer loop once the signal below is sent
+    }
+    // signal to the main thread that we're done (for Sync())
+    pthread_cond_signal(&worker->condition_);
+    pthread_mutex_unlock(&worker->mutex_);
+  }
+  return THREAD_RETURN(NULL);    // Thread is finished
+}
+
+// main thread state control
+static void WebPWorkerChangeState(WebPWorker* const worker,
+                                  WebPWorkerStatus new_status) {
+  // no-op when attempting to change state on a thread that didn't come up
+  if (worker->status_ < OK) return;   // status_ is read without the lock here
+
+  pthread_mutex_lock(&worker->mutex_);
+  // wait for the worker to finish
+  while (worker->status_ != OK) {
+    pthread_cond_wait(&worker->condition_, &worker->mutex_);
+  }
+  // assign new status and release the working thread if needed
+  if (new_status != OK) {   // OK is a pure "wait until idle" request
+    worker->status_ = new_status;
+    pthread_cond_signal(&worker->condition_);
+  }
+  pthread_mutex_unlock(&worker->mutex_);
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+
+void WebPWorkerInit(WebPWorker* const worker) {
+  memset(worker, 0, sizeof(*worker));   // clears hook, data1/2 and had_error
+  worker->status_ = NOT_OK;   // unusable until WebPWorkerReset() succeeds
+}
+
+int WebPWorkerSync(WebPWorker* const worker) {
+#ifdef WEBP_USE_THREAD
+  WebPWorkerChangeState(worker, OK);   // blocks until any pending task is done
+#endif
+  assert(worker->status_ <= OK);
+  return !worker->had_error;   // true if no hook call has reported failure
+}
+
+int WebPWorkerReset(WebPWorker* const worker) {
+  int ok = 1;
+  worker->had_error = 0;   // errors from previous tasks are forgotten
+  if (worker->status_ < OK) {   // not started yet (or ended): bring it up
+#ifdef WEBP_USE_THREAD
+    if (pthread_mutex_init(&worker->mutex_, NULL) ||
+        pthread_cond_init(&worker->condition_, NULL)) {
+      return 0;   // NOTE(review): mutex_ leaks if only cond_init failed
+    }
+    pthread_mutex_lock(&worker->mutex_);
+    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
+    if (ok) worker->status_ = OK;   // NOTE(review): no cleanup when !ok
+    pthread_mutex_unlock(&worker->mutex_);
+#else
+    worker->status_ = OK;
+#endif
+  } else if (worker->status_ > OK) {   // a task is in flight: wait for it
+    ok = WebPWorkerSync(worker);
+  }
+  assert(!ok || (worker->status_ == OK));
+  return ok;
+}
+
+void WebPWorkerLaunch(WebPWorker* const worker) {
+#ifdef WEBP_USE_THREAD
+  WebPWorkerChangeState(worker, WORK);   // wait for idle, then wake the thread
+#else
+  if (worker->hook)   // no thread support: the hook runs right here
+    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+#endif
+}
+
+void WebPWorkerEnd(WebPWorker* const worker) {
+  if (worker->status_ >= OK) {   // otherwise the object is already NOT_OK
+#ifdef WEBP_USE_THREAD
+    WebPWorkerChangeState(worker, NOT_OK);   // ask the thread loop to exit
+    pthread_join(worker->thread_, NULL);     // result is not checked
+    pthread_mutex_destroy(&worker->mutex_);
+    pthread_cond_destroy(&worker->condition_);
+#else
+    worker->status_ = NOT_OK;
+#endif
+  }
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/src/utils/thread.h
+++ b/src/utils/thread.h
@@ -0,0 +1,86 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Multi-threaded worker
+//
+// Author: skal@google.com (Pascal Massimino)
+
+#ifndef WEBP_UTILS_THREAD_H_
+#define WEBP_UTILS_THREAD_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if WEBP_USE_THREAD
+
+#if defined(_WIN32)
+
+#include <windows.h>
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+typedef struct {
+  HANDLE waiting_sem_;
+  HANDLE received_sem_;
+  HANDLE signal_event_;
+} pthread_cond_t;
+
+#else
+
+#include <pthread.h>
+
+#endif    /* _WIN32 */
+#endif    /* WEBP_USE_THREAD */
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,   // object is unusable (state after Init() and End())
+  OK,           // ready to work: idle, no task pending
+  WORK          // busy finishing the current task
+} WebPWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2), and should return false in case of error.
+typedef int (*WebPWorkerHook)(void*, void*);
+
+// Synchronize object used to launch job in the worker thread
+typedef struct {
+#if WEBP_USE_THREAD
+  pthread_mutex_t mutex_;
+  pthread_cond_t  condition_;
+  pthread_t       thread_;
+#endif
+  WebPWorkerStatus status_;   // current state: NOT_OK, OK or WORK
+  WebPWorkerHook hook;    // hook to call
+  void* data1;            // first argument passed to 'hook'
+  void* data2;            // second argument passed to 'hook'
+  int had_error;          // non-zero once 'hook' failed; cleared by Reset()
+} WebPWorker;
+
+// Must be called first, before any other method.
+void WebPWorkerInit(WebPWorker* const worker);
+// Must be called to initialize the object and spawn the thread. Re-entrant.
+// Will potentially launch the thread. Returns false in case of error.
+int WebPWorkerReset(WebPWorker* const worker);
+// Make sure the previous work is finished. Returns true if worker->had_error
+// was not set and no error condition was triggered by the working thread.
+int WebPWorkerSync(WebPWorker* const worker);
+// Trigger the thread to call hook() with data1 and data2 arguments. These
+// hook/data1/data2 can be changed at any time before calling this function,
+// but must not be changed afterward until the next call to WebPWorkerSync().
+void WebPWorkerLaunch(WebPWorker* const worker);
+// Kill the thread and terminate the object. To use the object again, one
+// must call WebPWorkerReset() again.
+void WebPWorkerEnd(WebPWorker* const worker);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_THREAD_H_ */
--- a/src/webp/decode.h
+++ b/src/webp/decode.h
@@ -12,40 +12,46 @@
 #ifndef WEBP_WEBP_DECODE_H_
 #define WEBP_WEBP_DECODE_H_

-#include "webp/types.h"
+#include "./types.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

+#define WEBP_DECODER_ABI_VERSION 0x0002
+
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-int WebPGetDecoderVersion(void);
+WEBP_EXTERN(int) WebPGetDecoderVersion(void);

 // Retrieve basic header information: width, height.
 // This function will also validate the header and return 0 in
 // case of formatting error.
 // Pointers *width/*height can be passed NULL if deemed irrelevant.
-int WebPGetInfo(const uint8_t* data, uint32_t data_size,
-                int *width, int *height);
+WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, uint32_t data_size,
+                             int* width, int* height);

 // Decodes WEBP images pointed to by *data and returns RGB samples, along
 // with the dimensions in *width and *height.
 // The returned pointer should be deleted calling free().
 // Returns NULL in case of error.
-uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+                                    int* width, int* height);

 // Same as WebPDecodeRGB, but returning RGBA data.
-uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height);
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning ARGB data.
+WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
+                                     int* width, int* height);

 // This variant decode to BGR instead of RGB.
-uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+                                    int* width, int* height);
 // This variant decodes to BGRA instead of RGBA.
-uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
-                        int *width, int *height);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
+                                     int* width, int* height);

 // Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is
 // the Y samples buffer. Upon return, *u and *v will point to the U and V
@@ -56,11 +62,12 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
-uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
-                       int *width, int *height, uint8_t** u, uint8_t** v,
-                       int *stride, int* uv_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
+                                    int* width, int* height,
+                                    uint8_t** u, uint8_t** v,
+                                    int* stride, int* uv_stride);

-// These three functions are variants of the above ones, that decode the image
+// These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
 // available in this buffer is indicated by 'output_buffer_size'. If this
 // storage is not sufficient (or an error occurred), NULL is returned.
@@ -68,19 +75,22 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size,
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
-uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output_buffer, int output_buffer_size,
-                           int output_stride);
-uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output_buffer, int output_buffer_size,
-                            int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);
 // BGR variants
-uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* output_buffer, int output_buffer_size,
-                           int output_stride);
-uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
-                            uint8_t* output_buffer, int output_buffer_size,
-                            int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);

 // WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly
 // into pre-allocated luma/chroma plane buffers. This function requires the
@@ -89,19 +99,72 @@ uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size,
 // 'u_size' and 'v_size' respectively.
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
-uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size,
-                           uint8_t* luma, int luma_size, int luma_stride,
-                           uint8_t* u, int u_size, int u_stride,
-                           uint8_t* v, int v_size, int v_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
+    const uint8_t* data, uint32_t data_size,
+    uint8_t* luma, int luma_size, int luma_stride,
+    uint8_t* u, int u_size, int u_stride,
+    uint8_t* v, int v_size, int v_stride);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
+// Output colorspaces and buffer

-// Output colorspaces
+// Colorspaces
 typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
               MODE_BGR = 2, MODE_BGRA = 3,
-               MODE_YUV = 4 } WEBP_CSP_MODE;
+               MODE_ARGB = 4, MODE_RGBA_4444 = 5,
+               MODE_RGB_565 = 6,
+               // YUV modes must come after RGB ones.
+               MODE_YUV = 7, MODE_YUVA = 8,  // yuv 4:2:0
+               MODE_LAST = 9
+             } WEBP_CSP_MODE;

+// Generic structure for describing the sample buffer.
+typedef struct {    // view as RGBA
+  uint8_t* rgba;    // pointer to RGBA samples
+  int stride;       // stride in bytes from one scanline to the next.
+  int size;         // total size of the *rgba buffer.
+} WebPRGBABuffer;
+
+typedef struct {              // view as YUVA
+  uint8_t* y, *u, *v, *a;     // pointer to luma, chroma U/V, alpha samples
+  int y_stride;               // luma stride
+  int u_stride, v_stride;     // chroma strides
+  int a_stride;               // alpha stride
+  int y_size;                 // luma plane size
+  int u_size, v_size;         // chroma planes size
+  int a_size;                 // alpha-plane size
+} WebPYUVABuffer;
+
+// Output buffer
+typedef struct {
+  WEBP_CSP_MODE colorspace;  // Colorspace.
+  int width, height;         // Dimensions.
+  int is_external_memory;    // If true, 'private_memory' pointer is not used.
+  union {
+    WebPRGBABuffer RGBA;
+    WebPYUVABuffer YUVA;
+  } u;                       // Union of buffer parameters, accessed as 'u'.
+  uint8_t* private_memory;   // Internally allocated memory (only when
+                             // is_external_memory is false). Should not be used
+                             // externally, but accessed via the buffer union.
+} WebPDecBuffer;
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer* const, int);
+
+// Initialize the structure as empty. Must be called before any other use.
+// Returns false in case of version mismatch
+static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) {
+  return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
+}
+
+// Free any memory associated with the buffer. Must always be called last.
+// Note: doesn't free the 'buffer' structure itself.
+WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* const buffer);
+
+//------------------------------------------------------------------------------
 // Enumeration of the status codes
+
 typedef enum {
  VP8_STATUS_OK = 0,
  VP8_STATUS_OUT_OF_MEMORY,
@@ -113,11 +176,11 @@ typedef enum {
  VP8_STATUS_NOT_ENOUGH_DATA
 } VP8StatusCode;

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Incremental decoding
 //
-//  This API allows streamlined decoding of partial data.
-//  Picture can be incrementally decoded as data become available thanks to the
+// This API allows streamlined decoding of partial data.
+// Picture can be incrementally decoded as data become available thanks to the
 // WebPIDecoder object. This object can be left in a SUSPENDED state if the
 // picture is only partially decoded, pending additional input.
 // Code example:
@@ -138,16 +201,26 @@ typedef enum {

 typedef struct WebPIDecoder WebPIDecoder;

+// Creates a new incremental decoder with the supplied buffer parameter.
+// This output_buffer can be passed NULL, in which case a default output buffer
+// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
+// is kept, which means that the lifespan of 'output_buffer' must be larger than
+// that of the returned WebPIDecoder object.
+// Returns NULL if the allocation failed.
+WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* const output_buffer);
+
 // Creates a WebPIDecoder object. Returns NULL in case of failure.
-WebPIDecoder* WebPINew(WEBP_CSP_MODE mode);
+// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder().
+WEBP_EXTERN(WebPIDecoder*) WebPINew(WEBP_CSP_MODE mode);

 // This function allocates and initializes an incremental-decoder object, which
 // will output the r/g/b(/a) samples specified by 'mode' into a preallocated
 // buffer 'output_buffer'. The size of this buffer is at least
 // 'output_buffer_size' and the stride (distance in bytes between two scanlines)
 // is specified by 'output_stride'. Returns NULL if the allocation failed.
-WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
-                          int output_buffer_size, int output_stride);
+WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
+    WEBP_CSP_MODE mode,
+    uint8_t* output_buffer, int output_buffer_size, int output_stride);

 // This function allocates and initializes an incremental-decoder object, which
 // will output the raw luma/chroma samples into a preallocated planes. The luma
@@ -156,41 +229,165 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
 // 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size'
 // and 'v_size'.
 // Returns NULL if the allocation failed.
-WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride,
-                          uint8_t* u, int u_size, int u_stride,
-                          uint8_t* v, int v_size, int v_stride);
+WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
+    uint8_t* luma, int luma_size, int luma_stride,
+    uint8_t* u, int u_size, int u_stride,
+    uint8_t* v, int v_size, int v_stride);

-// Deletes the WebpBuffer object and associated memory. Must always be called
+// Deletes the WebPIDecoder object and associated memory. Must always be called
 // if WebPINew, WebPINewRGB or WebPINewYUV succeeded.
-void WebPIDelete(WebPIDecoder* const idec);
+WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* const idec);

 // Copies and decodes the next available data. Returns VP8_STATUS_OK when
 // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
 // data is expected. Returns error in other cases.
-VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size);
+WEBP_EXTERN(VP8StatusCode) WebPIAppend(
+    WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);

 // A variant of the above function to be used when data buffer contains
 // partial data from the beginning. In this case data buffer is not copied
 // to the internal memory.
 // Note that the value of the 'data' pointer can change between calls to
 // WebPIUpdate, for instance when the data buffer is resized to fit larger data.
-VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data,
-                          uint32_t data_size);
+WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
+    WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size);

-// Returns the RGB image decoded so far. Returns NULL if output params are not
-// initialized yet. *last_y is the index of last decoded row in raster scan
-// order. Some pointers (*last_y, *width etc.) can be NULL if corresponding
-// information is not needed.
-uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y,
-                        int* width, int* height, int* stride);
+// Returns the r/g/b/(a) image decoded so far. Returns NULL if output params
+// are not initialized yet. The r/g/b/(a) output type corresponds to the mode
+// specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded
+// row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if
+// corresponding information is not needed.
+WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
+    const WebPIDecoder* const idec, int* last_y,
+    int* width, int* height, int* stride);

 // Same as above function to get YUV image. Returns pointer to the luma plane
 // or NULL in case of error.
-uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y,
-                        uint8_t** u, uint8_t** v,
-                        int* width, int* height, int* stride, int* uv_stride);
+WEBP_EXTERN(uint8_t*) WebPIDecGetYUV(
+    const WebPIDecoder* const idec, int* last_y,
+    uint8_t** u, uint8_t** v,
+    int* width, int* height, int* stride, int* uv_stride);

+// Generic call to retrieve information about the displayable area.
+// If non NULL, the left/top/width/height pointers are filled with the visible
+// rectangular area so far.
+// Returns NULL in case the incremental decoder object is in an invalid state.
+// Otherwise returns the pointer to the internal representation. This structure
+// is read-only, tied to WebPIDecoder's lifespan and should not be modified.
+WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
+    const WebPIDecoder* const idec,
+    int* const left, int* const top,
+    int* const width, int* const height);
+
+//------------------------------------------------------------------------------
+// Advanced decoding parametrization
+//
+//  Code sample for using the advanced decoding API
+/*
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config', if needed
+     config.options.no_fancy_upsampling = 1;
+     config.output.colorspace = MODE_BGRA;
+     // etc.
+
+     // Note that you can also make config.output point to an externally
+     // supplied memory buffer, provided it's big enough to store the decoded
+     // picture. Otherwise, config.output will just be used to allocate memory
+     // and store the decoded picture.
+
+     // D) Decode!
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E) Decoded image is now in config.output (and config.output.u.RGBA)
+
+     // F) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+*/
+
+// Features gathered from the bitstream
+typedef struct {
+  int width;        // the original width, as read from the bitstream
+  int height;       // the original height, as read from the bitstream
+  int has_alpha;    // true if bitstream contains an alpha channel
+  int no_incremental_decoding;  // if true, using incremental decoding is not
+                                // recommended.
+  int rotate;                   // TODO(later)
+  int uv_sampling;              // should be 0 for now. TODO(later)
+  int bitstream_version;        // should be 0 for now. TODO(later)
+} WebPBitstreamFeatures;
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
+    const uint8_t*, uint32_t, WebPBitstreamFeatures* const, int);
+
+// Retrieve features from the bitstream. The *features structure is filled
+// with information gathered from the bitstream.
+// Returns VP8_STATUS_OK on success, or another status code in case of error
+// or version mismatch.
+static inline
+  VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size,
+                                WebPBitstreamFeatures* const features) {
+  return WebPGetFeaturesInternal(data, data_size, features,
+                                 WEBP_DECODER_ABI_VERSION);
+}
+
+// Decoding options
+typedef struct {
+  int bypass_filtering;               // if true, skip the in-loop filtering
+  int no_fancy_upsampling;            // if true, use faster pointwise upsampler
+  int use_cropping;                   // if true, cropping is applied _first_
+  int crop_left, crop_top;            // top-left position for cropping.
+                                      // Will be snapped to even values.
+  int crop_width, crop_height;        // dimension of the cropping area
+  int use_scaling;                    // if true, scaling is applied _afterward_
+  int scaled_width, scaled_height;    // final resolution
+  int force_rotation;                 // forced rotation (to be applied _last_)
+  int no_enhancement;                 // if true, discard enhancement layer
+  int use_threads;                    // if true, use multi-threaded decoding
+} WebPDecoderOptions;
+
+// Main object storing the configuration for advanced decoding.
+typedef struct {
+  WebPBitstreamFeatures input;  // Immutable bitstream features (optional)
+  WebPDecBuffer output;         // Output buffer (can point to external mem)
+  WebPDecoderOptions options;   // Decoding options
+} WebPDecoderConfig;
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int);
+
+// Initialize the configuration as empty. This function must always be
+// called first, unless WebPGetFeatures() is to be called.
+// Returns false in case of mismatched version.
+static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) {
+  return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
+}
+
+// Instantiate a new incremental decoder object with requested configuration.
+// The bitstream can be passed using *data and data_size parameter,
+// in which case the features will be parsed and stored into config->input.
+// Otherwise, 'data' can be NULL and no parsing will occur.
+// Note that 'config' can be NULL too, in which case a default configuration is
+// used.
+// The returned WebPIDecoder object must always be deleted with WebPIDelete().
+// Returns NULL in case of error (and config->status will then reflect
+// the error condition).
+WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, uint32_t data_size,
+                                       WebPDecoderConfig* const config);
+
+// Non-incremental version. This version decodes the full data at once, taking
+// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding
+// was successful).
+WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, uint32_t data_size,
+                                      WebPDecoderConfig* const config);

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/webp/decode_vp8.h
+++ b/src/webp/decode_vp8.h
@@ -12,18 +12,16 @@
 #ifndef WEBP_WEBP_DECODE_VP8_H_
 #define WEBP_WEBP_DECODE_VP8_H_

-#include "webp/decode.h"
+#include "./decode.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#define WEBP_DECODER_ABI_VERSION 0x0001
-
-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Lower-level API
 //
-// Thes functions provide fine-grained control of the decoding process.
+// These functions provide fine-grained control of the decoding process.
 // The call flow should resemble:
 //
 //   VP8Io io;
@@ -40,14 +38,22 @@ extern "C" {

 // Input / Output
 typedef struct VP8Io VP8Io;
+typedef int (*VP8IoPutHook)(const VP8Io* io);
+typedef int (*VP8IoSetupHook)(VP8Io* io);
+typedef void (*VP8IoTeardownHook)(const VP8Io* io);
+
 struct VP8Io {
  // set by VP8GetHeaders()
-  int width, height;         // picture dimensions, in pixels
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.

  // set before calling put()
  int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
  int mb_h;                  // number of rows in the sample
-  const uint8_t *y, *u, *v;  // rows to copy (in yuv420 format)
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
  int y_stride;              // row stride for luma
  int uv_stride;             // row stride for chroma

@@ -56,20 +62,24 @@ struct VP8Io {
  // called when fresh samples are available. Currently, samples are in
  // YUV420 format, and can be up to width x 24 in size (depending on the
  // in-loop filtering level, e.g.). Should return false in case of error
-  // or abort request.
-  int (*put)(const VP8Io* io);
+  // or abort request. The actual size of the area to update is mb_w x mb_h
+  // in size, taking cropping into account.
+  VP8IoPutHook put;

  // called just before starting to decode the blocks.
-  // Should returns 0 in case of error.
-  int (*setup)(VP8Io* io);
+  // Must return false in case of setup error, true otherwise. If false is
+  // returned, teardown() will NOT be called. But if the setup succeeded
+  // and true is returned, then teardown() will always be called afterward.
+  VP8IoSetupHook setup;

-  // called just after block decoding is finished (or when an error occurred).
-  void (*teardown)(const VP8Io* io);
+  // Called just after block decoding is finished (or when an error occurred
+  // during put()). Is NOT called if setup() failed.
+  VP8IoTeardownHook teardown;

  // this is a recommendation for the user-side yuv->rgb converter. This flag
  // is set when calling setup() hook and can be overwritten by it. It then
  // can be taken into consideration during the put() method.
-  int fancy_upscaling;
+  int fancy_upsampling;

  // Input buffer.
  uint32_t data_size;
@@ -80,16 +90,36 @@ struct VP8Io {
  // of more visible blocking. Note that output will also be non-compliant
  // with the VP8 specifications.
  int bypass_filtering;
+
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
+  // pointer to the alpha data (if present) corresponding to the rows
+  const uint8_t* a;
 };

 // Internal, version-checked, entry point
-int VP8InitIoInternal(VP8Io* const, int);
+WEBP_EXTERN(int) VP8InitIoInternal(VP8Io* const, int);
+
+// Set the custom IO function pointers and user-data. The setter for IO hooks
+// should be called before initiating incremental decoding. Returns true if
+// WebPIDecoder object is successfully modified, false otherwise.
+WEBP_EXTERN(int) WebPISetIOHooks(WebPIDecoder* const idec,
+                                 VP8IoPutHook put,
+                                 VP8IoSetupHook setup,
+                                 VP8IoTeardownHook teardown,
+                                 void* user_data);

 // Main decoding object. This is an opaque structure.
 typedef struct VP8Decoder VP8Decoder;

 // Create a new decoder object.
-VP8Decoder* VP8New(void);
+WEBP_EXTERN(VP8Decoder*) VP8New(void);

 // Must be called to make sure 'io' is initialized properly.
 // Returns false in case of version mismatch. Upon such failure, no other
@@ -99,26 +129,26 @@ static inline int VP8InitIo(VP8Io* const io) {
 }

 // Start decoding a new picture. Returns true if ok.
-int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+WEBP_EXTERN(int) VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);

 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
 // Returns false in case of error.
-int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+WEBP_EXTERN(int) VP8Decode(VP8Decoder* const dec, VP8Io* const io);

 // Return current status of the decoder:
-VP8StatusCode VP8Status(VP8Decoder* const dec);
+WEBP_EXTERN(VP8StatusCode) VP8Status(VP8Decoder* const dec);

 // return readable string corresponding to the last status.
-const char* VP8StatusMessage(VP8Decoder* const dec);
+WEBP_EXTERN(const char*) VP8StatusMessage(VP8Decoder* const dec);

 // Resets the decoder in its initial state, reclaiming memory.
 // Not a mandatory call between calls to VP8Decode().
-void VP8Clear(VP8Decoder* const dec);
+WEBP_EXTERN(void) VP8Clear(VP8Decoder* const dec);

 // Destroy the decoder object.
-void VP8Delete(VP8Decoder* const dec);
+WEBP_EXTERN(void) VP8Delete(VP8Decoder* const dec);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/webp/encode.h
+++ b/src/webp/encode.h
@@ -14,35 +14,38 @@

 #include <stdlib.h>

-#include "webp/types.h"
+#include "./types.h"

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif

-#define WEBP_ENCODER_ABI_VERSION 0x0001
+#define WEBP_ENCODER_ABI_VERSION 0x0002

 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-int WebPGetEncoderVersion(void);
+WEBP_EXTERN(int) WebPGetEncoderVersion(void);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // One-stop-shop call! No questions asked:

 // Returns the size of the compressed data (pointed to by *output), or 0 if
 // an error occurred. The compressed data must be released by the caller
 // using the call 'free(*output)'.
-// Currently, alpha values are discarded.
-size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride,
-                     float quality_factor, uint8_t** output);
-size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride,
-                     float quality_factor, uint8_t** output);
-size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride,
-                      float quality_factor, uint8_t** output);
-size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride,
-                      float quality_factor, uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb,
+                                  int width, int height, int stride,
+                                  float quality_factor, uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr,
+                                  int width, int height, int stride,
+                                  float quality_factor, uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba,
+                                   int width, int height, int stride,
+                                   float quality_factor, uint8_t** output);
+WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra,
+                                   int width, int height, int stride,
+                                   float quality_factor, uint8_t** output);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Coding parameters

 typedef struct {
@@ -66,6 +69,9 @@ typedef struct {
  int preprocessing;     // preprocessing filter (0=none, 1=segment-smooth)
  int partitions;        // log2(number of token partitions) in [0..3]
                         // Default is set to 0 for easier progressive decoding.
+  int partition_limit;   // quality degradation allowed to fit the 512k limit on
+                         // prediction modes coding (0=no degradation, 100=full)
+  int alpha_compression;  // Algorithm for optimizing the alpha plane (0 = none)
 } WebPConfig;

 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -80,7 +86,8 @@ typedef enum {
 } WebPPreset;

 // Internal, version-checked, entry point
-int WebPConfigInitInternal(WebPConfig* const, WebPPreset, float, int);
+WEBP_EXTERN(int) WebPConfigInitInternal(
+    WebPConfig* const, WebPPreset, float, int);

 // Should always be called, to initialize a fresh WebPConfig structure before
 // modification. Returns 0 in case of version mismatch. WebPConfigInit() must
@@ -101,25 +108,28 @@ static inline int WebPConfigPreset(WebPConfig* const config,
 }

 // Returns 1 if all parameters are in valid range and the configuration is OK.
-int WebPValidateConfig(const WebPConfig* const config);
+WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* const config);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Input / Output

 typedef struct WebPPicture WebPPicture;   // main structure for I/O

-// non-essential structure for storing auxilliary statistics
+// non-essential structure for storing auxiliary statistics
 typedef struct {
  float PSNR[4];          // peak-signal-to-noise ratio for Y/U/V/All
  int coded_size;         // final size
  int block_count[3];     // number of intra4/intra16/skipped macroblocks
-  int header_bytes[2];    // approximative number of bytes spent for header
+  int header_bytes[2];    // approximate number of bytes spent for header
                          // and mode-partition #0
-  int residual_bytes[3][4];  // approximative number of bytes spent for
+  int residual_bytes[3][4];  // approximate number of bytes spent for
                             // DC/AC/uv coefficients for each (0..3) segments.
  int segment_size[4];    // number of macroblocks in each segments
  int segment_quant[4];   // quantizer values for each segments
  int segment_level[4];   // filtering strength for each segments [0..63]
+
+  int alpha_data_size;    // size of the transparency data
+  int layer_data_size;    // size of the enhancement layer data
 } WebPAuxStats;

 // Signature for output function. Should return 1 if writing was successful.
@@ -128,13 +138,46 @@ typedef struct {
 typedef int (*WebPWriterFunction)(const uint8_t* data, size_t data_size,
                                  const WebPPicture* const picture);

+typedef enum {
+  // chroma sampling
+  WEBP_YUV420 = 0,   // 4:2:0
+  WEBP_YUV422 = 1,   // 4:2:2
+  WEBP_YUV444 = 2,   // 4:4:4
+  WEBP_YUV400 = 3,   // grayscale
+  WEBP_CSP_UV_MASK = 3,   // bit-mask to get the UV sampling factors
+  // alpha channel variants
+  WEBP_YUV420A = 4,
+  WEBP_YUV422A = 5,
+  WEBP_YUV444A = 6,
+  WEBP_YUV400A = 7,   // grayscale + alpha
+  WEBP_CSP_ALPHA_BIT = 4   // bit that is set if alpha is present
+} WebPEncCSP;
+
+// Encoding error conditions.
+typedef enum {
+  VP8_ENC_OK = 0,
+  VP8_ENC_ERROR_OUT_OF_MEMORY,            // memory error allocating objects
+  VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY,  // memory error while flushing bits
+  VP8_ENC_ERROR_NULL_PARAMETER,           // a pointer parameter is NULL
+  VP8_ENC_ERROR_INVALID_CONFIGURATION,    // configuration is invalid
+  VP8_ENC_ERROR_BAD_DIMENSION,            // picture has invalid width/height
+  VP8_ENC_ERROR_PARTITION0_OVERFLOW,      // partition is bigger than 512k
+  VP8_ENC_ERROR_PARTITION_OVERFLOW,       // partition is bigger than 16M
+  VP8_ENC_ERROR_BAD_WRITE,                // error while flushing bytes
+  VP8_ENC_ERROR_FILE_TOO_BIG,             // file is bigger than 4G
+} WebPEncodingError;
+
+// maximum width/height allowed (inclusive), in pixels
+#define WEBP_MAX_DIMENSION 16383
+
 struct WebPPicture {
  // input
-  int colorspace;            // colorspace: should be 0 for now (=Y'CbCr).
-  int width, height;         // dimensions.
+  WebPEncCSP colorspace;     // colorspace: should be YUV420 for now (=Y'CbCr).
+  int width, height;         // dimensions (less or equal to WEBP_MAX_DIMENSION)
  uint8_t *y, *u, *v;        // pointers to luma/chroma planes.
  int y_stride, uv_stride;   // luma/chroma strides.
-  uint8_t *a;                // pointer to the alpha plane (unused for now).
+  uint8_t *a;                // pointer to the alpha plane
+  int a_stride;              // stride of the alpha plane

  // output
  WebPWriterFunction writer;  // can be NULL
@@ -152,10 +195,16 @@ struct WebPPicture {

  // where to store statistics, if not NULL:
  WebPAuxStats* stats;
+
+  // original samples (for non-YUV420 modes)
+  uint8_t *u0, *v0;
+  int uv0_stride;
+
+  WebPEncodingError error_code;   // error code in case of problem.
 };

 // Internal, version-checked, entry point
-int WebPPictureInitInternal(WebPPicture* const, int);
+WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture* const, int);

 // Should always be called, to initialize the structure. Returns 0 in case of
 // version mismatch. WebPPictureInit() must have succeeded before using the
@@ -164,54 +213,64 @@ static inline int WebPPictureInit(WebPPicture* const picture) {
  return WebPPictureInitInternal(picture, WEBP_ENCODER_ABI_VERSION);
 }

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // WebPPicture utils

 // Convenience allocation / deallocation based on picture->width/height:
-// Allocate y/u/v buffers as per width/height specification.
+// Allocate y/u/v buffers as per colorspace/width/height specification.
 // Note! This function will free the previous buffer if needed.
 // Returns 0 in case of memory error.
-int WebPPictureAlloc(WebPPicture* const picture);
+WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* const picture);

 // Release memory allocated by WebPPictureAlloc() or WebPPictureImport*()
 // Note that this function does _not_ free the memory pointed to by 'picture'.
-void WebPPictureFree(WebPPicture* const picture);
+WEBP_EXTERN(void) WebPPictureFree(WebPPicture* const picture);

 // Copy the pixels of *src into *dst, using WebPPictureAlloc.
 // Returns 0 in case of memory allocation error.
-int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst);
+WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* const src,
+                                 WebPPicture* const dst);

 // self-crops a picture to the rectangle defined by top/left/width/height.
 // Returns 0 in case of memory allocation error, or if the rectangle is
 // outside of the source picture.
-int WebPPictureCrop(WebPPicture* const picture,
-                     int left, int top, int width, int height);
+WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* const picture,
+                                 int left, int top, int width, int height);

-// Colorspace conversion function. Previous buffer will be free'd, if any.
+// Rescale a picture to new dimension width x height.
+// No gamma correction is applied.
+// Returns false in case of error (invalid parameter or insufficient memory).
+WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* const pic,
+                                    int width, int height);
+
+// Colorspace conversion function to import RGB samples.
+// Previous buffer will be free'd, if any.
 // *rgb buffer should have a size of at least height * rgb_stride.
 // Returns 0 in case of memory error.
-int WebPPictureImportRGB(WebPPicture* const picture,
-                         const uint8_t* const rgb, int rgb_stride);
-// Same, but for RGBA buffer. Alpha information is ignored.
-int WebPPictureImportRGBA(WebPPicture* const picture,
-                          const uint8_t* const rgba, int rgba_stride);
+WEBP_EXTERN(int) WebPPictureImportRGB(
+    WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride);
+// Same, but for RGBA buffer
+WEBP_EXTERN(int) WebPPictureImportRGBA(
+    WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride);

-// Variant of the above, but taking BGR input:
-int WebPPictureImportBGR(WebPPicture* const picture,
-                         const uint8_t* const bgr, int bgr_stride);
-int WebPPictureImportBGRA(WebPPicture* const picture,
-                          const uint8_t* const bgra, int bgra_stride);
+// Variant of the above, but taking BGR(A) input:
+WEBP_EXTERN(int) WebPPictureImportBGR(
+    WebPPicture* const picture, const uint8_t* const bgr, int bgr_stride);
+WEBP_EXTERN(int) WebPPictureImportBGRA(
+    WebPPicture* const picture, const uint8_t* const bgra, int bgra_stride);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------
 // Main call

-// Main encoding call, after config and picture have been initialiazed.
-// 'picture' must be less than 16384x16384 in dimension, and the 'config' object
-// must be a valid one.
+// Main encoding call, after config and picture have been initialized.
+// 'picture' must be less than 16384x16384 in dimension (cf WEBP_MAX_DIMENSION),
+// and the 'config' object must be a valid one.
 // Returns false in case of error, true otherwise.
-int WebPEncode(const WebPConfig* const config, WebPPicture* const picture);
+// In case of error, picture->error_code is updated accordingly.
+WEBP_EXTERN(int) WebPEncode(
+    const WebPConfig* const config, WebPPicture* const picture);

-//-----------------------------------------------------------------------------
+//------------------------------------------------------------------------------

 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
--- a/src/webp/types.h
+++ b/src/webp/types.h
@@ -29,4 +29,10 @@ typedef long long int int64_t;
 #define inline __forceinline
 #endif  /* _MSC_VER */

+#ifndef WEBP_EXTERN
+// This explicitly marks library functions and allows for changing the
+// signature for e.g., Windows DLL builds.
+#define WEBP_EXTERN(type) extern type
+#endif  /* WEBP_EXTERN */
+
 #endif  /* WEBP_WEBP_TYPES_H_ */
--- a/swig/README
+++ b/swig/README
@@ -0,0 +1,39 @@
+Building:
+=========
+
+JNI SWIG bindings:
+------------------
+ $ gcc -shared -fPIC -fno-strict-aliasing -O2 \
+       -I/path/to/your/jdk/includes \
+       libwebp_java_wrap.c \
+       -lwebp \
+       -o libwebp_jni.so
+
+-------------------------------------- BEGIN PSEUDO EXAMPLE
+import com.google.webp.libwebp;
+
+import java.lang.reflect.Method;
+
+public class libwebp_jni_example {
+  static {
+    System.loadLibrary("webp_jni");
+  }
+
+  /**
+   * usage: java -cp libwebp.jar:. libwebp_jni_example
+   */
+  public static void main(String argv[]) {
+    final int version = libwebp.WebPGetDecoderVersion();
+    System.out.println("libwebp version: " + Integer.toHexString(version));
+
+    System.out.println("libwebp methods:");
+    final Method[] libwebpMethods = libwebp.class.getDeclaredMethods();
+    for (int i = 0; i < libwebpMethods.length; i++) {
+      System.out.println(libwebpMethods[i]);
+    }
+  }
+}
+-------------------------------------- END PSEUDO EXAMPLE
+
+ $ javac -cp libwebp.jar libwebp_jni_example.java
+ $ java -Djava.library.path=. -cp libwebp.jar:. libwebp_jni_example
--- a/swig/libwebp.i
+++ b/swig/libwebp.i
@@ -0,0 +1,232 @@
+// Copyright 2011 Google Inc.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// libwebp swig interface definition
+//
+// Author: James Zern (jzern@google.com)
+//
+// For java bindings compile with:
+//  $ mkdir -p java/com/google/webp
+//  $ swig -ignoremissing -I../src \
+//         -java \
+//         -package com.google.webp \
+//         -outdir java/com/google/webp \
+//         -o libwebp_java_wrap.c libwebp.i
+%module libwebp
+
+%include "constraints.i"  // supplies the NONNEGATIVE constraint used below
+%include "typemaps.i"     // supplies the INPUT/OUTPUT pointer typemaps
+
+#ifdef SWIGJAVA
+%include "arrays_java.i";
+%include "enums.swg" /*NB: requires JDK-1.5+
+                       See: http://www.swig.org/Doc1.3/Java.html#enumerations */
+
+// Map uint8_t* so that the Java side uses byte[] (via the signed char[]
+// typemaps). This generates a few spurious warnings in the wrapper code.
+%apply signed char[] { uint8_t * }
+#endif  /* SWIGJAVA */
+
+//------------------------------------------------------------------------------
+// Decoder specific
+
+%apply int *OUTPUT { int *width, int *height }  // dimensions are returned
+%apply int { uint32_t data_size }
+%apply Number NONNEGATIVE { uint32_t data_size }  // negative sizes rejected
+
+// Each of these returns a newly allocated buffer; it is released with free()
+// once its contents have been copied into the native type.
+%newobject WebPDecodeRGB;
+%newobject WebPDecodeRGBA;
+%newobject WebPDecodeARGB;
+%newobject WebPDecodeBGR;
+%newobject WebPDecodeBGRA;
+%typemap(newfree) uint8_t* "free($1);"
+
+int WebPGetDecoderVersion(void);
+int WebPGetInfo(const uint8_t* data, uint32_t data_size,
+                int *width, int *height);
+
+uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size,
+                       int *width, int *height);
+uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size,
+                        int *width, int *height);
+uint8_t* WebPDecodeARGB(const uint8_t* data, uint32_t data_size,
+                        int* width, int* height);
+uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size,
+                       int *width, int *height);
+uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size,
+                        int *width, int *height);
+
+//------------------------------------------------------------------------------
+// Encoder specific (the WebPEncode* functions are wrapped further below)
+
+int WebPGetEncoderVersion(void);
+
+//------------------------------------------------------------------------------
+// Wrapper code additions
+
+%{
+#include "webp/decode.h"
+#include "webp/encode.h"
+// Size placeholder used by SWIG's array output typemap; 0 if result is NULL.
+#define FillMeInAsSizeCannotBeDeterminedAutomatically \
+    (result ? returned_buffer_size(__FUNCTION__, arg3, arg4) : 0)
+// Returns *width * *height * size_multiplier for |function|, or -1 if unknown.
+static jint returned_buffer_size(
+    const char *function, int *width, int *height) {
+  static const struct sizemap {
+    const char *function;  // wrapper name as produced by __FUNCTION__
+    int size_multiplier;   // scales *width * *height; 1 for encode wrappers
+  } size_map[] = {
+    { "Java_com_google_webp_libwebpJNI_WebPDecodeRGB",  3 },
+    { "Java_com_google_webp_libwebpJNI_WebPDecodeRGBA", 4 },
+    { "Java_com_google_webp_libwebpJNI_WebPDecodeARGB", 4 },
+    { "Java_com_google_webp_libwebpJNI_WebPDecodeBGR",  3 },
+    { "Java_com_google_webp_libwebpJNI_WebPDecodeBGRA", 4 },
+    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeRGB",  1 },
+    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeBGR",  1 },
+    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeRGBA", 1 },
+    { "Java_com_google_webp_libwebpJNI_wrap_1WebPEncodeBGRA", 1 },
+    { NULL, 0 }  // sentinel: ends the lookup loop below
+  };
+  const struct sizemap *p;
+  jint size = -1;  // kept when |function| matches no table entry
+
+  for (p = size_map; p->function; p++) {
+    if (!strcmp(function, p->function)) {
+      size = *width * *height * p->size_multiplier;
+      break;
+    }
+  }
+
+  return size;
+}
+// Signature of the WebPEncode{RGB,BGR,RGBA,BGRA} functions passed to encode().
+typedef size_t (*WebPEncodeFunction)(const uint8_t* rgb,
+                                     int width, int height, int stride,
+                                     float quality_factor, uint8_t** output);
+// Calls |encfn| and returns its output buffer, or NULL if image_size is zero.
+static uint8_t* encode(const uint8_t* rgb,
+                       int width, int height, int stride,
+                       float quality_factor,
+                       WebPEncodeFunction encfn,
+                       int* output_size, int* unused) {
+  uint8_t *output = NULL;
+  const size_t image_size =
+      encfn(rgb, width, height, stride, quality_factor, &output);
+  // returned_buffer_size() reads the following two as 'width' and 'height',
+  // so their product (image_size * 1) is the size of the returned buffer.
+  *output_size = image_size;
+  *unused = 1;
+  return image_size ? output : NULL;
+}
+%}
+
+//------------------------------------------------------------------------------
+// libwebp/encode wrapper functions
+
+%apply int *INPUT { int *unused1, int *unused2 }
+%apply int *OUTPUT { int *output_size }  // set by encode() to the byte count
+
+// The buffers returned by these functions are released with free() (see the
+// uint8_t* newfree typemap) after being copied into the native type.
+%newobject wrap_WebPEncodeRGB;
+%newobject wrap_WebPEncodeBGR;
+%newobject wrap_WebPEncodeRGBA;
+%newobject wrap_WebPEncodeBGRA;
+
+#ifdef SWIGJAVA
+// Only the public WebPEncode*() methods in the Java modulecode call these.
+%javamethodmodifiers wrap_WebPEncodeRGB "private";
+%javamethodmodifiers wrap_WebPEncodeBGR "private";
+%javamethodmodifiers wrap_WebPEncodeRGBA "private";
+%javamethodmodifiers wrap_WebPEncodeBGRA "private";
+#endif  /* SWIGJAVA */
+
+%inline %{
+// Wrappers that change the return type of WebPEncode* to more closely match
+// Decode*: the encoded buffer is returned directly, which makes it easier to
+// wrap in a native type than an output pointer. unused1 is ignored here;
+// unused2 and output_size exist so returned_buffer_size() can be reused: they
+// are set by encode() and read back as 'width' and 'height'.
+static uint8_t* wrap_WebPEncodeRGB(
+    const uint8_t* rgb, int* unused1, int* unused2, int* output_size,
+    int width, int height, int stride, float quality_factor) {
+  return encode(rgb, width, height, stride, quality_factor,
+                WebPEncodeRGB, output_size, unused2);
+}
+
+static uint8_t* wrap_WebPEncodeBGR(
+    const uint8_t* bgr, int* unused1, int* unused2, int* output_size,
+    int width, int height, int stride, float quality_factor) {
+  return encode(bgr, width, height, stride, quality_factor,
+                WebPEncodeBGR, output_size, unused2);
+}
+
+static uint8_t* wrap_WebPEncodeRGBA(
+    const uint8_t* rgba, int* unused1, int* unused2, int* output_size,
+    int width, int height, int stride, float quality_factor) {
+  return encode(rgba, width, height, stride, quality_factor,
+                WebPEncodeRGBA, output_size, unused2);
+}
+
+static uint8_t* wrap_WebPEncodeBGRA(
+    const uint8_t* bgra, int* unused1, int* unused2, int* output_size,
+    int width, int height, int stride, float quality_factor) {
+  return encode(bgra, width, height, stride, quality_factor,
+                WebPEncodeBGRA, output_size, unused2);
+}
+%}
+
+//------------------------------------------------------------------------------
+// Language specific
+
+#ifdef SWIGJAVA
+%{
+/* Work around broken gcj jni.h: define JNIEXPORT and JNICALL as empty */
+#ifdef __GCJ_JNI_H__
+# undef JNIEXPORT
+# define JNIEXPORT
+# undef JNICALL
+# define JNICALL
+#endif
+%}
+
+%pragma(java) modulecode=%{
+  private static final int UNUSED = 1;  // dummy for the unused1/unused2 slots
+  private static int outputSize[] = { 0 };  // scratch slot for output_size
+  // Public entry points; each forwards to its private wrap_WebPEncode* method.
+  public static byte[] WebPEncodeRGB(byte[] rgb,
+                                     int width, int height, int stride,
+                                     float quality_factor) {
+    return wrap_WebPEncodeRGB(
+        rgb, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
+  }
+
+  public static byte[] WebPEncodeBGR(byte[] bgr,
+                                     int width, int height, int stride,
+                                     float quality_factor) {
+    return wrap_WebPEncodeBGR(
+        bgr, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
+  }
+
+  public static byte[] WebPEncodeRGBA(byte[] rgba,
+                                      int width, int height, int stride,
+                                      float quality_factor) {
+    return wrap_WebPEncodeRGBA(
+        rgba, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
+  }
+
+  public static byte[] WebPEncodeBGRA(byte[] bgra,
+                                      int width, int height, int stride,
+                                      float quality_factor) {
+    return wrap_WebPEncodeBGRA(
+        bgra, UNUSED, UNUSED, outputSize, width, height, stride, quality_factor);
+  }
+%}
+#endif  /* SWIGJAVA */
--- a/swig/libwebp.jar
+++ b/swig/libwebp.jar
--- a/swig/libwebp_java_wrap.c
+++ b/swig/libwebp_java_wrap.c