resolved conflicts, updated retina class interface and optimized a heavy retinacolor process

2013-04-29 19:06:35 +02:00
parent 9549949228 416fb50594
commit 72742f5316
2166 changed files with 342437 additions and 211189 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,27 +1,36 @@
 .git*       export-ignore

-*           text=auto
-*           whitespace=!indent,trail,space
+*           text=auto whitespace=trailing-space,space-before-tab,-indent-with-non-tab,tab-in-indent,tabwidth=4

-*.py        text whitespace=tab-in-indent,trail,space,fix
-*.cpp       text whitespace=tab-in-indent,trail,space,fix
-*.hpp       text whitespace=tab-in-indent,trail,space,fix
-*.cxx       text whitespace=tab-in-indent,trail,space,fix
-*.hxx       text whitespace=tab-in-indent,trail,space,fix
-*.mm        text whitespace=tab-in-indent,trail,space,fix
-*.c         text whitespace=tab-in-indent,trail,space,fix
-*.h         text whitespace=tab-in-indent,trail,space,fix
-*.i         text whitespace=tab-in-indent,trail,space,fix
-*.java      text whitespace=tab-in-indent,trail,space,fix
-*.cu        text whitespace=tab-in-indent,trail,space,fix
-*.cl        text whitespace=tab-in-indent,trail,space,fix
+*.py        text
+*.cpp       text
+*.hpp       text
+*.cxx       text
+*.hxx       text
+*.mm        text
+*.c         text
+*.h         text
+*.i         text
+*.js        text
+*.java      text
+*.scala     text
+*.cu        text
+*.cl        text
+*.css_t     text
+*.qrc       text
+*.qss       text
+*.S         text
+*.rst       text
+*.tex       text
+*.sty       text

-*.cmake     text whitespace=tab-in-indent,trail,space,fix
-*.cmakein   text whitespace=tab-in-indent,trail,space,fix
-*.in        text whitespace=tab-in-indent,trail,space,fix
-CMakeLists.txt  text whitespace=tab-in-indent,trail,space,fix
+*.aidl      text
+*.mk        text

-*.rst       text whitespace=tab-in-indent,trail,space,fix
+*.cmake         text whitespace=tabwidth=2
+*.cmakein       text whitespace=tabwidth=2
+*.in            text whitespace=tabwidth=2
+CMakeLists.txt  text whitespace=tabwidth=2

 *.png       binary
 *.jepg      binary
@@ -32,22 +41,21 @@ CMakeLists.txt  text whitespace=tab-in-indent,trail,space,fix
 *.a         binary
 *.so        binary
 *.dll       binary
+*.jar       binary

 *.pdf       binary
 *.pbxproj   binary
 *.vec       binary
 *.doc       binary

-*.css_t     text
-*.qrc       text
-*.qss       text
-*.S         text
-
-*.xml       -text
-*.yml       -text
+*.xml                      -text whitespace=cr-at-eol
+*.yml                      -text whitespace=cr-at-eol
+.project                   -text whitespace=cr-at-eol merge=union
+.classpath                 -text whitespace=cr-at-eol merge=union
+.cproject                  -text whitespace=cr-at-eol merge=union
+org.eclipse.jdt.core.prefs -text whitespace=cr-at-eol merge=union

 *.vcproj    text eol=crlf merge=union
-*.cproject  text eol=crlf merge=union
 *.bat       text eol=crlf
 *.cmd       text eol=crlf
 *.cmd.tmpl  text eol=crlf
--- a/3rdparty/ffmpeg/ffmpeg_version.cmake
+++ b/3rdparty/ffmpeg/ffmpeg_version.cmake
@@ -1,3 +1,4 @@
+set(HAVE_FFMPEG 1)
 set(NEW_FFMPEG 1)
 set(HAVE_FFMPEG_CODEC 1)
 set(HAVE_FFMPEG_FORMAT 1)
--- a/3rdparty/ffmpeg/opencv_ffmpeg.dll
+++ b/3rdparty/ffmpeg/opencv_ffmpeg.dll
--- a/3rdparty/ffmpeg/opencv_ffmpeg_64.dll
+++ b/3rdparty/ffmpeg/opencv_ffmpeg_64.dll
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r2.3.3.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/armeabi-v7a/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/armeabi/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/mips/libnative_camera_r4.0.3.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.0.3.so
--- a/3rdparty/lib/mips/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/mips/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/mips/libnative_camera_r4.2.0.so
--- a/3rdparty/lib/x86/libnative_camera_r4.1.1.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.1.1.so
--- a/3rdparty/lib/x86/libnative_camera_r4.2.0.so
+++ b/3rdparty/lib/x86/libnative_camera_r4.2.0.so
--- a/3rdparty/libjasper/CMakeLists.txt
+++ b/3rdparty/libjasper/CMakeLists.txt
@@ -24,6 +24,7 @@ if(WIN32 AND NOT MINGW)
 endif(WIN32 AND NOT MINGW)

 ocv_warnings_disable(CMAKE_C_FLAGS -Wno-implicit-function-declaration -Wno-uninitialized -Wmissing-prototypes -Wmissing-declarations -Wunused -Wshadow -Wsign-compare)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4018 /wd4101 /wd4244 /wd4267 /wd4715) # vs2005

 if(UNIX)
--- a/3rdparty/libjpeg/CMakeLists.txt
+++ b/3rdparty/libjpeg/CMakeLists.txt
@@ -9,6 +9,12 @@ ocv_include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 file(GLOB lib_srcs *.c)
 file(GLOB lib_hdrs *.h)

+if(ANDROID OR IOS)
+  ocv_list_filterout(lib_srcs jmemansi.c)
+else()
+  ocv_list_filterout(lib_srcs jmemnobs.c)
+endif()
+
 # ----------------------------------------------------------------------------------
 #         Define the library target:
 # ----------------------------------------------------------------------------------
@@ -26,6 +32,8 @@ if(CMAKE_COMPILER_IS_GNUCXX)
 endif()

 ocv_warnings_disable(CMAKE_C_FLAGS -Wcast-align -Wshadow -Wunused)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
+ocv_warnings_disable(CMAKE_C_FLAGS /wd4013 /wd4244 /wd4267) # vs2005

 set_target_properties(${JPEG_LIBRARY}
  PROPERTIES OUTPUT_NAME ${JPEG_LIBRARY}
--- a/3rdparty/libjpeg/README
+++ b/3rdparty/libjpeg/README
@@ -1,24 +1,20 @@
 The Independent JPEG Group's JPEG software
 ==========================================

-README for release 6b of 27-Mar-1998
-====================================
+README for release 9 of 13-Jan-2013
+===================================

-This distribution contains the sixth public release of the Independent JPEG
+This distribution contains the ninth public release of the Independent JPEG
 Group's free JPEG software.  You are welcome to redistribute this software and
 to use it for any purpose, subject to the conditions under LEGAL ISSUES, below.

-Serious users of this software (particularly those incorporating it into
-larger programs) should contact IJG at jpeg-info@uunet.uu.net to be added to
-our electronic mailing list.  Mailing list members are notified of updates
-and have a chance to participate in technical discussions, etc.
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.

-This software is the work of Tom Lane, Philip Gladstone, Jim Boucher,
-Lee Crocker, Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi,
-Guido Vollbeding, Ge' Weijers, and other members of the Independent JPEG
-Group.
-
-IJG is not affiliated with the official ISO JPEG standards committee.
+IJG is not affiliated with the ISO/IEC JTC1/SC29/WG1 standards committee
+(also known as JPEG, together with ITU-T SG16).


 DOCUMENTATION ROADMAP
@@ -30,27 +26,27 @@ OVERVIEW            General description of JPEG and the IJG software.
 LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
 REFERENCES          Where to learn more about JPEG.
 ARCHIVE LOCATIONS   Where to find newer versions of this software.
-RELATED SOFTWARE    Other stuff you should get.
+ACKNOWLEDGMENTS     Special thanks.
 FILE FORMAT WARS    Software *not* to get.
 TO DO               Plans for future IJG releases.

 Other documentation files in the distribution are:

 User documentation:
-  install.doc       How to configure and install the IJG software.
-  usage.doc         Usage instructions for cjpeg, djpeg, jpegtran,
+  install.txt       How to configure and install the IJG software.
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
                    rdjpgcom, and wrjpgcom.
-  *.1               Unix-style man pages for programs (same info as usage.doc).
-  wizard.doc        Advanced usage instructions for JPEG wizards only.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
  change.log        Version-to-version change highlights.
 Programmer and internal documentation:
-  libjpeg.doc       How to use the JPEG library in your own programs.
+  libjpeg.txt       How to use the JPEG library in your own programs.
  example.c         Sample code for calling the JPEG library.
-  structure.doc     Overview of the JPEG library's internal structure.
-  filelist.doc      Road map of IJG files.
-  coderules.doc     Coding style rules --- please read if you contribute code.
+  structure.txt     Overview of the JPEG library's internal structure.
+  filelist.txt      Road map of IJG files.
+  coderules.txt     Coding style rules --- please read if you contribute code.

-Please read at least the files install.doc and usage.doc.  Useful information
+Please read at least the files install.txt and usage.txt.  Some information
 can also be found in the JPEG FAQ (Frequently Asked Questions) article.  See
 ARCHIVE LOCATIONS below to find out where to obtain the FAQ article.

@@ -62,24 +58,15 @@ the order listed) before diving into the code.
 OVERVIEW
 ========

-This package contains C software to implement JPEG image compression and
-decompression.  JPEG (pronounced "jay-peg") is a standardized compression
-method for full-color and gray-scale images.  JPEG is intended for compressing
-"real-world" scenes; line drawings, cartoons and other non-realistic images
-are not its strong suit.  JPEG is lossy, meaning that the output image is not
-exactly identical to the input image.  Hence you must not use JPEG if you
-have to have identical output bits.  However, on typical photographic images,
-very good compression levels can be obtained with no visible change, and
-remarkably high compression levels are possible if you can tolerate a
-low-quality image.  For more details, see the references, or just experiment
-with various compression settings.
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and gray-scale images.

 This software implements JPEG baseline, extended-sequential, and progressive
 compression processes.  Provision is made for supporting all variants of these
 processes, although some uncommon parameter settings aren't implemented yet.
-For legal reasons, we are not distributing code for the arithmetic-coding
-variants of JPEG; see LEGAL ISSUES.  We have made no provision for supporting
-the hierarchical or lossless processes defined in the standard.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.

 We provide a set of library routines for reading and writing JPEG image files,
 plus two sample applications "cjpeg" and "djpeg", which use the library to
@@ -91,10 +78,11 @@ considerable functionality beyond the bare JPEG coding/decoding capability;
 for example, the color quantization modules are not strictly part of JPEG
 decoding, but they are essential for output to colormapped file formats or
 colormapped displays.  These extra functions can be compiled out of the
-library if not required for a particular application.  We have also included
-"jpegtran", a utility for lossless transcoding between different JPEG
-processes, and "rdjpgcom" and "wrjpgcom", two simple applications for
-inserting and extracting textual comments in JFIF files.
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.

 The emphasis in designing this software has been on achieving portability and
 flexibility, while also making it fast enough to be useful.  In particular,
@@ -127,7 +115,7 @@ with respect to this software, its quality, accuracy, merchantability, or
 fitness for a particular purpose.  This software is provided "AS IS", and you,
 its user, assume the entire risk as to its quality and accuracy.

-This software is copyright (C) 1991-1998, Thomas G. Lane.
+This software is copyright (C) 1991-2013, Thomas G. Lane, Guido Vollbeding.
 All Rights Reserved except as specified below.

 Permission is hereby granted to use, copy, modify, and distribute this
@@ -158,29 +146,11 @@ commercial products, provided that all warranty or liability claims are
 assumed by the product vendor.


-ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
-sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA.
-ansi2knr.c is NOT covered by the above copyright and conditions, but instead
-by the usual distribution terms of the Free Software Foundation; principally,
-that you must include source code if you redistribute it.  (See the file
-ansi2knr.c for full details.)  However, since ansi2knr.c is not needed as part
-of any program generated from the IJG code, this does not limit you more than
-the foregoing paragraphs do.
-
 The Unix configuration script "configure" was produced with GNU Autoconf.
 It is copyright by the Free Software Foundation but is freely distributable.
 The same holds for its supporting scripts (config.guess, config.sub,
-ltconfig, ltmain.sh).  Another support script, install-sh, is copyright
-by M.I.T. but is also freely distributable.
-
-It appears that the arithmetic coding option of the JPEG spec is covered by
-patents owned by IBM, AT&T, and Mitsubishi.  Hence arithmetic coding cannot
-legally be used without obtaining one or more licenses.  For this reason,
-support for arithmetic coding has been removed from the free JPEG software.
-(Since arithmetic coding provides only a marginal gain over the unpatented
-Huffman mode, it is unlikely that very many implementations will support it.)
-So far as we are aware, there are no patent restrictions on the remaining
-code.
+ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+but is also freely distributable.

 The IJG distribution formerly included code to read and write GIF files.
 To avoid entanglement with the Unisys LZW patent, GIF reading support has
@@ -198,7 +168,7 @@ We are required to state that
 REFERENCES
 ==========

-We highly recommend reading one or more of these references before trying to
+We recommend reading one or more of these references before trying to
 understand the innards of the JPEG software.

 The best short technical introduction to the JPEG compression algorithm is
@@ -207,7 +177,7 @@ The best short technical introduction to the JPEG compression algorithm is
 (Adjacent articles in that issue discuss MPEG motion picture compression,
 applications of JPEG, and related topics.)  If you don't have the CACM issue
 handy, a PostScript file containing a revised version of Wallace's article is
-available at ftp://ftp.uu.net/graphics/jpeg/wallace.ps.gz.  The file (actually
+available at http://www.ijg.org/files/wallace.ps.gz.  The file (actually
 a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
 omits the sample images that appeared in CACM, but it includes corrections
 and some added material.  Note: the Wallace article is copyright ACM and IEEE,
@@ -222,82 +192,71 @@ code but don't know much about data compression in general.  The book's JPEG
 sample code is far from industrial-strength, but when you are ready to look
 at a full implementation, you've got one here...

-The best full description of JPEG is the textbook "JPEG Still Image Data
-Compression Standard" by William B. Pennebaker and Joan L. Mitchell, published
-by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.  Price US$59.95, 638 pp.
-The book includes the complete text of the ISO JPEG standards (DIS 10918-1
-and draft DIS 10918-2).  This is by far the most complete exposition of JPEG
-in existence, and we highly recommend it.
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+Although this is by far the most detailed and comprehensive exposition of
+JPEG publicly available, we point out that it is still missing an explanation
+of the most essential properties and algorithms of the underlying DCT
+technology.
+If you think that you know about DCT-based JPEG after reading this book,
+then you are in delusion.  The real fundamentals and corresponding potential
+of DCT-based JPEG are not publicly known so far, and that is the reason for
+all the mistaken developments taking place in the image coding domain.

-The JPEG standard itself is not available electronically; you must order a
-paper copy through ISO or ITU.  (Unless you feel a need to own a certified
-official copy, we recommend buying the Pennebaker and Mitchell book instead;
-it's much cheaper and includes a great deal of useful explanatory material.)
-In the USA, copies of the standard may be ordered from ANSI Sales at (212)
-642-4900, or from Global Engineering Documents at (800) 854-7179.  (ANSI
-doesn't take credit card orders, but Global does.)  It's not cheap: as of
-1992, ANSI was charging $95 for Part 1 and $47 for Part 2, plus 7%
-shipping/handling.  The standard is divided into two parts, Part 1 being the
-actual specification, while Part 2 covers compliance testing methods.  Part 1
-is titled "Digital Compression and Coding of Continuous-tone Still Images,
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
 Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
 10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
 Continuous-tone Still Images, Part 2: Compliance testing" and has document
 numbers ISO/IEC IS 10918-2, ITU-T T.83.
-
-Some extensions to the original JPEG standard are defined in JPEG Part 3,
-a newer ISO standard numbered ISO/IEC IS 10918-3 and ITU-T T.84.  IJG
-currently does not support any Part 3 extensions.
+IJG JPEG 8 introduced an implementation of the JPEG SmartScale extension
+which is specified in two documents:  A contributed document at ITU and ISO
+with title "ITU-T JPEG-Plus Proposal for Extending ITU-T T.81 for Advanced
+Image Coding", April 2006, Geneva, Switzerland.  The latest version of this
+document is Revision 3.  And a contributed document ISO/IEC JTC1/SC29/WG1 N
+5799 with title "Evolution of JPEG", June/July 2011, Berlin, Germany.
+IJG JPEG 9 introduces a reversible color transform for improved lossless
+compression which is described in a contributed document ISO/IEC JTC1/SC29/
+WG1 N 6080 with title "JPEG 9 Lossless Coding", June/July 2012, Paris,
+France.

 The JPEG standard does not specify all details of an interchangeable file
 format.  For the omitted details we follow the "JFIF" conventions, revision
-1.02.  A copy of the JFIF spec is available from:
-	Literature Department
-	C-Cube Microsystems, Inc.
-	1778 McCarthy Blvd.
-	Milpitas, CA 95035
-	phone (408) 944-6300,  fax (408) 944-6314
-A PostScript version of this document is available by FTP at
-ftp://ftp.uu.net/graphics/jpeg/jfif.ps.gz.  There is also a plain text
-version at ftp://ftp.uu.net/graphics/jpeg/jfif.txt.gz, but it is missing
-the figures.
+1.02.  JFIF 1.02 has been adopted as an Ecma International Technical Report
+and thus received a formal publication status.  It is available as a free
+download in PDF format from
+http://www.ecma-international.org/publications/techreports/E-TR-098.htm.
+A PostScript version of the JFIF document is available at
+http://www.ijg.org/files/jfif.ps.gz.  There is also a plain text version at
+http://www.ijg.org/files/jfif.txt.gz, but it is missing the figures.

 The TIFF 6.0 file format specification can be obtained by FTP from
 ftp://ftp.sgi.com/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation scheme
 found in the TIFF 6.0 spec of 3-June-92 has a number of serious problems.
 IJG does not recommend use of the TIFF 6.0 design (TIFF Compression tag 6).
 Instead, we recommend the JPEG design proposed by TIFF Technical Note #2
-(Compression tag 7).  Copies of this Note can be obtained from ftp.sgi.com or
-from ftp://ftp.uu.net/graphics/jpeg/.  It is expected that the next revision
+(Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
 of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
 Although IJG's own code does not support TIFF/JPEG, the free libtiff library
-uses our library to implement TIFF/JPEG per the Note.  libtiff is available
-from ftp://ftp.sgi.com/graphics/tiff/.
+uses our library to implement TIFF/JPEG per the Note.


 ARCHIVE LOCATIONS
 =================

-The "official" archive site for this software is ftp.uu.net (Internet
-address 192.48.96.9).  The most recent released version can always be found
-there in directory graphics/jpeg.  This particular version will be archived
-as ftp://ftp.uu.net/graphics/jpeg/jpegsrc.v6b.tar.gz.  If you don't have
-direct Internet access, UUNET's archives are also available via UUCP; contact
-help@uunet.uu.net for information on retrieving files that way.
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".  This particular version will be archived as
+http://www.ijg.org/files/jpegsrc.v9.tar.gz, and in Windows-compatible
+"zip" archive format as http://www.ijg.org/files/jpegsr9.zip.

-Numerous Internet sites maintain copies of the UUNET files.  However, only
-ftp.uu.net is guaranteed to have the latest official version.
-
-You can also obtain this software in DOS-compatible "zip" archive format from
-the SimTel archives (ftp://ftp.simtel.net/pub/simtelnet/msdos/graphics/), or
-on CompuServe in the Graphics Support forum (GO CIS:GRAPHSUP), library 12
-"JPEG Tools".  Again, these versions may sometimes lag behind the ftp.uu.net
-release.
-
-The JPEG FAQ (Frequently Asked Questions) article is a useful source of
-general information about JPEG.  It is updated constantly and therefore is
-not included in this distribution.  The FAQ is posted every two weeks to
-Usenet newsgroups comp.graphics.misc, news.answers, and other groups.
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.
 It is available on the World Wide Web at http://www.faqs.org/faqs/jpeg-faq/
 and other news.answers archive sites, including the official news.answers
 archive at rtfm.mit.edu: ftp://rtfm.mit.edu/pub/usenet/news.answers/jpeg-faq/.
@@ -307,79 +266,85 @@ with body
 	send usenet/news.answers/jpeg-faq/part2


-RELATED SOFTWARE
-================
+ACKNOWLEDGMENTS
+===============

-Numerous viewing and image manipulation programs now support JPEG.  (Quite a
-few of them use this library to do so.)  The JPEG FAQ described above lists
-some of the more popular free and shareware viewers, and tells where to
-obtain them on Internet.
+Thank to Juergen Bruder for providing me with a copy of the common DCT
+algorithm article, only to find out that I had come to the same result
+in a more direct and comprehensible way with a more generative approach.

-If you are on a Unix machine, we highly recommend Jef Poskanzer's free
-PBMPLUS software, which provides many useful operations on PPM-format image
-files.  In particular, it can convert PPM images to and from a wide range of
-other formats, thus making cjpeg/djpeg considerably more useful.  The latest
-version is distributed by the NetPBM group, and is available from numerous
-sites, notably ftp://wuarchive.wustl.edu/graphics/graphics/packages/NetPBM/.
-Unfortunately PBMPLUS/NETPBM is not nearly as portable as the IJG software is;
-you are likely to have difficulty making it work on any non-Unix machine.
+Thank to Istvan Sebestyen and Joan L. Mitchell for inviting me to the
+ITU JPEG (Study Group 16) meeting in Geneva, Switzerland.

-A different free JPEG implementation, written by the PVRG group at Stanford,
-is available from ftp://havefun.stanford.edu/pub/jpeg/.  This program
-is designed for research and experimentation rather than production use;
-it is slower, harder to use, and less portable than the IJG code, but it
-is easier to read and modify.  Also, the PVRG code supports lossless JPEG,
-which we do not.  (On the other hand, it doesn't do progressive JPEG.)
+Thank to Thomas Wiegand and Gary Sullivan for inviting me to the
+Joint Video Team (MPEG & ITU) meeting in Geneva, Switzerland.
+
+Thank to Thomas Richter and Daniel Lee for inviting me to the
+ISO/IEC JTC1/SC29/WG1 (also known as JPEG, together with ITU-T SG16)
+meeting in Berlin, Germany.
+
+Thank to John Korejwa and Massimo Ballerini for inviting me to
+fruitful consultations in Boston, MA and Milan, Italy.
+
+Thank to Hendrik Elstner, Roland Fassauer, Simone Zuck, Guenther
+Maier-Gerber, Walter Stoeber, Fred Schmitz, and Norbert Braunagel
+for corresponding business development.
+
+Thank to Nico Zschach and Dirk Stelling of the technical support team
+at the Digital Images company in Halle for providing me with extra
+equipment for configuration tests.
+
+Thank to Richard F. Lyon (then of Foveon Inc.) for fruitful
+communication about JPEG configuration in Sigma Photo Pro software.
+
+Thank to Andrew Finkenstadt for hosting the ijg.org site.
+
+Last but not least special thank to Thomas G. Lane for the original
+design and development of this singular software package.


 FILE FORMAT WARS
 ================

-Some JPEG programs produce files that are not compatible with our library.
-The root of the problem is that the ISO JPEG committee failed to specify a
-concrete file format.  Some vendors "filled in the blanks" on their own,
-creating proprietary formats that no one else could read.  (For example, none
-of the early commercial JPEG implementations for the Macintosh were able to
-exchange compressed files.)
+The ISO/IEC JTC1/SC29/WG1 standards committee (also known as JPEG, together
+with ITU-T SG16) currently promotes different formats containing the name
+"JPEG" which is misleading because these formats are incompatible with
+original DCT-based JPEG and are based on faulty technologies.
+IJG therefore does not and will not support such momentary mistakes
+(see REFERENCES).
+There exist also distributions under the name "OpenJPEG" promoting such
+kind of formats which is misleading because they don't support original
+JPEG images.
+We have no sympathy for the promotion of inferior formats.  Indeed, one of
+the original reasons for developing this free software was to help force
+convergence on common, interoperable format standards for JPEG files.
+Don't use an incompatible file format!
+(In any case, our decoder will remain capable of reading existing JPEG
+image files indefinitely.)

-The file format we have adopted is called JFIF (see REFERENCES).  This format
-has been agreed to by a number of major commercial JPEG vendors, and it has
-become the de facto standard.  JFIF is a minimal or "low end" representation.
-We recommend the use of TIFF/JPEG (TIFF revision 6.0 as modified by TIFF
-Technical Note #2) for "high end" applications that need to record a lot of
-additional data about an image.  TIFF/JPEG is fairly new and not yet widely
-supported, unfortunately.
+Furthermore, the ISO committee pretends to be "responsible for the popular
+JPEG" in their public reports which is not true because they don't respond to
+actual requirements for the maintenance of the original JPEG specification.

-The upcoming JPEG Part 3 standard defines a file format called SPIFF.
-SPIFF is interoperable with JFIF, in the sense that most JFIF decoders should
-be able to read the most common variant of SPIFF.  SPIFF has some technical
-advantages over JFIF, but its major claim to fame is simply that it is an
-official standard rather than an informal one.  At this point it is unclear
-whether SPIFF will supersede JFIF or whether JFIF will remain the de-facto
-standard.  IJG intends to support SPIFF once the standard is frozen, but we
-have not decided whether it should become our default output format or not.
-(In any case, our decoder will remain capable of reading JFIF indefinitely.)
-
-Various proprietary file formats incorporating JPEG compression also exist.
-We have little or no sympathy for the existence of these formats.  Indeed,
-one of the original reasons for developing this free software was to help
-force convergence on common, open format standards for JPEG files.  Don't
-use a proprietary file format!
+There are currently different distributions in circulation containing the
+name "libjpeg" which is misleading because they don't have the features and
+are incompatible with formats supported by actual IJG libjpeg distributions.
+One of those fakes is released by members of the ISO committee and just uses
+the name of libjpeg for misdirection of people, similar to the abuse of the
+name JPEG as described above, while having nothing in common with actual IJG
+libjpeg distributions.
+The other one claims to be a "derivative" or "fork" of the original libjpeg
+and violates the license conditions as described under LEGAL ISSUES above.
+We have no sympathy for the release of misleading and illegal distributions
+derived from obsolete code bases.
+Don't use an obsolete code base!


 TO DO
 =====

-The major thrust for v7 will probably be improvement of visual quality.
-The current method for scaling the quantization tables is known not to be
-very good at low Q values.  We also intend to investigate block boundary
-smoothing, "poor man's variable quantization", and other means of improving
-quality-vs-file-size performance without sacrificing compatibility.
+Version 9 is the second release of a new generation JPEG standard
+to overcome the limitations of the original JPEG specification.
+More features are being prepared for coming releases...

-In future versions, we are considering supporting some of the upcoming JPEG
-Part 3 extensions --- principally, variable quantization and the SPIFF file
-format.
-
-As always, speeding things up is of great interest.
-
-Please send bug reports, offers of help, etc. to jpeg-info@uunet.uu.net.
+Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org.
--- a/3rdparty/libjpeg/change.log
+++ b/3rdparty/libjpeg/change.log
@@ -0,0 +1,382 @@
+CHANGE LOG for Independent JPEG Group's JPEG software
+
+
+Version 9  13-Jan-2013
+----------------------
+
+Add cjpeg -rgb1 option to create an RGB JPEG file, and insert
+a simple reversible color transform into the processing which
+significantly improves the compression.
+The recommended command for lossless coding of RGB images is now
+cjpeg -rgb1 -block 1 -arithmetic.
+As said, this option improves the compression significantly, but
+the files are not compatible with JPEG decoders prior to IJG v9
+due to the included color transform.
+The used color transform and marker signaling is compatible with
+other JPEG standards (e.g., JPEG-LS part 2).
+
+Remove the automatic de-ANSI-fication support (Automake 1.12).
+Thank also to Nitin A Kamble for suggestion.
+
+Add remark for jpeg_mem_dest() in jdatadst.c.
+Thank to Elie-Gregoire Khoury for the hint.
+
+Support files with invalid component identifiers (created
+by Adobe PDF).  Thank to Robin Watts for the suggestion.
+
+Adapt full buffer case in jcmainct.c for use with scaled DCT.
+Thank to Sergii Biloshytskyi for the suggestion.
+
+Add type identifier for declaration of noreturn functions.
+Thank to Brett L. Moore for the suggestion.
+
+Correct argument type in format string, avoid compiler warnings.
+Thank to Vincent Torri for hint.
+
+Add missing #include directives in configuration checks, avoid
+configuration errors.  Thank to John Spencer for the hint.
+
+
+Version 8d  15-Jan-2012
+-----------------------
+
+Add cjpeg -rgb option to create RGB JPEG files.
+Using this switch suppresses the conversion from RGB
+colorspace input to the default YCbCr JPEG colorspace.
+This feature allows true lossless JPEG coding of RGB color images.
+The recommended command for this purpose is currently
+cjpeg -rgb -block 1 -arithmetic.
+SmartScale capable decoder (introduced with IJG JPEG 8) required.
+Thank to Michael Koch for the initial suggestion.
+
+Add option to disable the region adjustment in the transupp crop code.
+Thank to Jeffrey Friedl for the suggestion.
+
+Thank to Richard Jones and Edd Dawson for various minor corrections.
+
+Thank to Akim Demaille for configure.ac cleanup.
+
+
+Version 8c  16-Jan-2011
+-----------------------
+
+Add option to compression library and cjpeg (-block N) to use
+different DCT block size.
+All N from 1 to 16 are possible.  Default is 8 (baseline format).
+Larger values produce higher compression,
+smaller values produce higher quality.
+SmartScale capable decoder (introduced with IJG JPEG 8) required.
+
+
+Version 8b  16-May-2010
+-----------------------
+
+Repair problem in new memory source manager with corrupt JPEG data.
+Thank to Ted Campbell and Samuel Chun for the report.
+
+Repair problem in Makefile.am test target.
+Thank to anonymous user for the report.
+
+Support MinGW installation with automatic configure.
+Thank to Volker Grabsch for the suggestion.
+
+
+Version 8a  28-Feb-2010
+-----------------------
+
+Writing tables-only datastreams via jpeg_write_tables works again.
+
+Support 32-bit BMPs (RGB image with Alpha channel) for read in cjpeg.
+Thank to Brett Blackham for the suggestion.
+
+Improve accuracy in floating point IDCT calculation.
+Thank to Robert Hooke for the hint.
+
+
+Version 8  10-Jan-2010
+----------------------
+
+jpegtran now supports the same -scale option as djpeg for "lossless" resize.
+An implementation of the JPEG SmartScale extension is required for this
+feature.  A (draft) specification of the JPEG SmartScale extension is
+available as a contributed document at ITU and ISO.  Revision 2 or later
+of the document is required (latest document version is Revision 3).
+The SmartScale extension will enable more features beside lossless resize
+in future implementations, as described in the document (new compression
+options).
+
+Add sanity check in BMP reader module to avoid cjpeg crash for empty input
+image (thank to Isaev Ildar of ISP RAS, Moscow, RU for reporting this error).
+
+Add data source and destination managers for read from and write to
+memory buffers.  New API functions jpeg_mem_src and jpeg_mem_dest.
+Thank to Roberto Boni from Italy for the suggestion.
+
+
+Version 7  27-Jun-2009
+----------------------
+
+New scaled DCTs implemented.
+djpeg now supports scalings N/8 with all N from 1 to 16.
+cjpeg now supports scalings 8/N with all N from 1 to 16.
+Scaled DCTs with size larger than 8 are now also used for resolving the
+common 2x2 chroma subsampling case without additional spatial resampling.
+Separate spatial resampling for those kind of files is now only necessary
+for N>8 scaling cases.
+Furthermore, separate scaled DCT functions are provided for direct resolving
+of the common asymmetric subsampling cases (2x1 and 1x2) without additional
+spatial resampling.
+
+cjpeg -quality option has been extended for support of separate quality
+settings for luminance and chrominance (or in general, for every provided
+quantization table slot).
+New API function jpeg_default_qtables() and q_scale_factor array in library.
+
+Added -nosmooth option to cjpeg, complementary to djpeg.
+New variable "do_fancy_downsampling" in library, complement to fancy
+upsampling.  Fancy upsampling now uses direct DCT scaling with sizes
+larger than 8.  The old method is not reversible and has been removed.
+
+Support arithmetic entropy encoding and decoding.
+Added files jaricom.c, jcarith.c, jdarith.c.
+
+Straighten the file structure:
+Removed files jidctred.c, jcphuff.c, jchuff.h, jdphuff.c, jdhuff.h.
+
+jpegtran has a new "lossless" cropping feature.
+
+Implement -perfect option in jpegtran, new API function
+jtransform_perfect_transform() in transupp. (DP 204_perfect.dpatch)
+
+Better error messages for jpegtran fopen failure.
+(DP 203_jpegtran_errmsg.dpatch)
+
+Fix byte order issue with 16bit PPM/PGM files in rdppm.c/wrppm.c:
+according to Netpbm, the de facto standard implementation of the PNM formats,
+the most significant byte is first. (DP 203_rdppm.dpatch)
+
+Add -raw option to rdjpgcom not to mangle the output.
+(DP 205_rdjpgcom_raw.dpatch)
+
+Make rdjpgcom locale aware. (DP 201_rdjpgcom_locale.dpatch)
+
+Add extern "C" to jpeglib.h.
+This avoids the need to put extern "C" { ... } around #include "jpeglib.h"
+in your C++ application.  Defining the symbol DONT_USE_EXTERN_C in the
+configuration prevents this. (DP 202_jpeglib.h_c++.dpatch)
+
+
+Version 6b  27-Mar-1998
+-----------------------
+
+jpegtran has new features for lossless image transformations (rotation
+and flipping) as well as "lossless" reduction to grayscale.
+
+jpegtran now copies comments by default; it has a -copy switch to enable
+copying all APPn blocks as well, or to suppress comments.  (Formerly it
+always suppressed comments and APPn blocks.)  jpegtran now also preserves
+JFIF version and resolution information.
+
+New decompressor library feature: COM and APPn markers found in the input
+file can be saved in memory for later use by the application.  (Before,
+you had to code this up yourself with a custom marker processor.)
+
+There is an unused field "void * client_data" now in compress and decompress
+parameter structs; this may be useful in some applications.
+
+JFIF version number information is now saved by the decoder and accepted by
+the encoder.  jpegtran uses this to copy the source file's version number,
+to ensure "jpegtran -copy all" won't create bogus files that contain JFXX
+extensions but claim to be version 1.01.  Applications that generate their
+own JFXX extension markers also (finally) have a supported way to cause the
+encoder to emit JFIF version number 1.02.
+
+djpeg's trace mode reports JFIF 1.02 thumbnail images as such, rather
+than as unknown APP0 markers.
+
+In -verbose mode, djpeg and rdjpgcom will try to print the contents of
+APP12 markers as text.  Some digital cameras store useful text information
+in APP12 markers.
+
+Handling of truncated data streams is more robust: blocks beyond the one in
+which the error occurs will be output as uniform gray, or left unchanged
+if decoding a progressive JPEG.  The appearance no longer depends on the
+Huffman tables being used.
+
+Huffman tables are checked for validity much more carefully than before.
+
+To avoid the Unisys LZW patent, djpeg's GIF output capability has been
+changed to produce "uncompressed GIFs", and cjpeg's GIF input capability
+has been removed altogether.  We're not happy about it either, but there
+seems to be no good alternative.
+
+The configure script now supports building libjpeg as a shared library
+on many flavors of Unix (all the ones that GNU libtool knows how to
+build shared libraries for).  Use "./configure --enable-shared" to
+try this out.
+
+New jconfig file and makefiles for Microsoft Visual C++ and Developer Studio.
+Also, a jconfig file and a build script for Metrowerks CodeWarrior
+on Apple Macintosh.  makefile.dj has been updated for DJGPP v2, and there
+are miscellaneous other minor improvements in the makefiles.
+
+jmemmac.c now knows how to create temporary files following Mac System 7
+conventions.
+
+djpeg's -map switch is now able to read raw-format PPM files reliably.
+
+cjpeg -progressive -restart no longer generates any unnecessary DRI markers.
+
+Multiple calls to jpeg_simple_progression for a single JPEG object
+no longer leak memory.
+
+
+Version 6a  7-Feb-96
+--------------------
+
+Library initialization sequence modified to detect version mismatches
+and struct field packing mismatches between library and calling application.
+This change requires applications to be recompiled, but does not require
+any application source code change.
+
+All routine declarations changed to the style "GLOBAL(type) name ...",
+that is, GLOBAL, LOCAL, METHODDEF, EXTERN are now macros taking the
+routine's return type as an argument.  This makes it possible to add
+Microsoft-style linkage keywords to all the routines by changing just
+these macros.  Note that any application code that was using these macros
+will have to be changed.
+
+DCT coefficient quantization tables are now stored in normal array order
+rather than zigzag order.  Application code that calls jpeg_add_quant_table,
+or otherwise manipulates quantization tables directly, will need to be
+changed.  If you need to make such code work with either older or newer
+versions of the library, a test like "#if JPEG_LIB_VERSION >= 61" is
+recommended.
+
+djpeg's trace capability now dumps DQT tables in natural order, not zigzag
+order.  This allows the trace output to be made into a "-qtables" file
+more easily.
+
+New system-dependent memory manager module for use on Apple Macintosh.
+
+Fix bug in cjpeg's -smooth option: last one or two scanlines would be
+duplicates of the prior line unless the image height mod 16 was 1 or 2.
+
+Repair minor problems in VMS, BCC, MC6 makefiles.
+
+New configure script based on latest GNU Autoconf.
+
+Correct the list of include files needed by MetroWerks C for ccommand().
+
+Numerous small documentation updates.
+
+
+Version 6  2-Aug-95
+-------------------
+
+Progressive JPEG support: library can read and write full progressive JPEG
+files.  A "buffered image" mode supports incremental decoding for on-the-fly
+display of progressive images.  Simply recompiling an existing IJG-v5-based
+decoder with v6 should allow it to read progressive files, though of course
+without any special progressive display.
+
+New "jpegtran" application performs lossless transcoding between different
+JPEG formats; primarily, it can be used to convert baseline to progressive
+JPEG and vice versa.  In support of jpegtran, the library now allows lossless
+reading and writing of JPEG files as DCT coefficient arrays.  This ability
+may be of use in other applications.
+
+Notes for programmers:
+* We changed jpeg_start_decompress() to be able to suspend; this makes all
+decoding modes available to suspending-input applications.  However,
+existing applications that use suspending input will need to be changed
+to check the return value from jpeg_start_decompress().  You don't need to
+do anything if you don't use a suspending data source.
+* We changed the interface to the virtual array routines: access_virt_array
+routines now take a count of the number of rows to access this time.  The
+last parameter to request_virt_array routines is now interpreted as the
+maximum number of rows that may be accessed at once, but not necessarily
+the height of every access.
+
+
+Version 5b  15-Mar-95
+---------------------
+
+Correct bugs with grayscale images having v_samp_factor > 1.
+
+jpeg_write_raw_data() now supports output suspension.
+
+Correct bugs in "configure" script for case of compiling in
+a directory other than the one containing the source files.
+
+Repair bug in jquant1.c: sometimes didn't use as many colors as it could.
+
+Borland C makefile and jconfig file work under either MS-DOS or OS/2.
+
+Miscellaneous improvements to documentation.
+
+
+Version 5a  7-Dec-94
+--------------------
+
+Changed color conversion roundoff behavior so that grayscale values are
+represented exactly.  (This causes test image files to change.)
+
+Make ordered dither use 16x16 instead of 4x4 pattern for a small quality
+improvement.
+
+New configure script based on latest GNU Autoconf.
+Fix configure script to handle CFLAGS correctly.
+Rename *.auto files to *.cfg, so that configure script still works if
+file names have been truncated for DOS.
+
+Fix bug in rdbmp.c: didn't allow for extra data between header and image.
+
+Modify rdppm.c/wrppm.c to handle 2-byte raw PPM/PGM formats for 12-bit data.
+
+Fix several bugs in rdrle.c.
+
+NEED_SHORT_EXTERNAL_NAMES option was broken.
+
+Revise jerror.h/jerror.c for more flexibility in message table.
+
+Repair oversight in jmemname.c NO_MKTEMP case: file could be there
+but unreadable.
+
+
+Version 5  24-Sep-94
+--------------------
+
+Version 5 represents a nearly complete redesign and rewrite of the IJG
+software.  Major user-visible changes include:
+  * Automatic configuration simplifies installation for most Unix systems.
+  * A range of speed vs. image quality tradeoffs are supported.
+    This includes resizing of an image during decompression: scaling down
+    by a factor of 1/2, 1/4, or 1/8 is handled very efficiently.
+  * New programs rdjpgcom and wrjpgcom allow insertion and extraction
+    of text comments in a JPEG file.
+
+The application programmer's interface to the library has changed completely.
+Notable improvements include:
+  * We have eliminated the use of callback routines for handling the
+    uncompressed image data.  The application now sees the library as a
+    set of routines that it calls to read or write image data on a
+    scanline-by-scanline basis.
+  * The application image data is represented in a conventional interleaved-
+    pixel format, rather than as a separate array for each color channel.
+    This can save a copying step in many programs.
+  * The handling of compressed data has been cleaned up: the application can
+    supply routines to source or sink the compressed data.  It is possible to
+    suspend processing on source/sink buffer overrun, although this is not
+    supported in all operating modes.
+  * All static state has been eliminated from the library, so that multiple
+    instances of compression or decompression can be active concurrently.
+  * JPEG abbreviated datastream formats are supported, ie, quantization and
+    Huffman tables can be stored separately from the image data.
+  * And not only that, but the documentation of the library has improved
+    considerably!
+
+
+The last widely used release before the version 5 rewrite was version 4A of
+18-Feb-93.  Change logs before that point have been discarded, since they
+are not of much interest after the rewrite.
--- a/3rdparty/libjpeg/jaricom.c
+++ b/3rdparty/libjpeg/jaricom.c
@@ -0,0 +1,153 @@
+/*
+ * jaricom.c
+ *
+ * Developed 1997-2011 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains probability estimation tables for common use in
+ * arithmetic entropy encoding and decoding routines.
+ *
+ * This data represents Table D.3 in the JPEG spec (D.2 in the draft),
+ * ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81, and Table 24
+ * in the JBIG spec, ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+/* The following #define specifies the packing of the four components
+ * into the compact INT32 representation.
+ * Note that this formula must match the actual arithmetic encoder
+ * and decoder implementation.  The implementation has to be changed
+ * if this formula is changed.
+ * The current organization is leaned on Markus Kuhn's JBIG
+ * implementation (jbig_tab.c).
+ */
+
+#define V(i,a,b,c,d) (((INT32)a << 16) | ((INT32)c << 8) | ((INT32)d << 7) | b)
+
+const INT32 jpeg_aritab[113+1] = {
+/*
+ * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
+ */
+  V(   0, 0x5a1d,   1,   1, 1 ),
+  V(   1, 0x2586,  14,   2, 0 ),
+  V(   2, 0x1114,  16,   3, 0 ),
+  V(   3, 0x080b,  18,   4, 0 ),
+  V(   4, 0x03d8,  20,   5, 0 ),
+  V(   5, 0x01da,  23,   6, 0 ),
+  V(   6, 0x00e5,  25,   7, 0 ),
+  V(   7, 0x006f,  28,   8, 0 ),
+  V(   8, 0x0036,  30,   9, 0 ),
+  V(   9, 0x001a,  33,  10, 0 ),
+  V(  10, 0x000d,  35,  11, 0 ),
+  V(  11, 0x0006,   9,  12, 0 ),
+  V(  12, 0x0003,  10,  13, 0 ),
+  V(  13, 0x0001,  12,  13, 0 ),
+  V(  14, 0x5a7f,  15,  15, 1 ),
+  V(  15, 0x3f25,  36,  16, 0 ),
+  V(  16, 0x2cf2,  38,  17, 0 ),
+  V(  17, 0x207c,  39,  18, 0 ),
+  V(  18, 0x17b9,  40,  19, 0 ),
+  V(  19, 0x1182,  42,  20, 0 ),
+  V(  20, 0x0cef,  43,  21, 0 ),
+  V(  21, 0x09a1,  45,  22, 0 ),
+  V(  22, 0x072f,  46,  23, 0 ),
+  V(  23, 0x055c,  48,  24, 0 ),
+  V(  24, 0x0406,  49,  25, 0 ),
+  V(  25, 0x0303,  51,  26, 0 ),
+  V(  26, 0x0240,  52,  27, 0 ),
+  V(  27, 0x01b1,  54,  28, 0 ),
+  V(  28, 0x0144,  56,  29, 0 ),
+  V(  29, 0x00f5,  57,  30, 0 ),
+  V(  30, 0x00b7,  59,  31, 0 ),
+  V(  31, 0x008a,  60,  32, 0 ),
+  V(  32, 0x0068,  62,  33, 0 ),
+  V(  33, 0x004e,  63,  34, 0 ),
+  V(  34, 0x003b,  32,  35, 0 ),
+  V(  35, 0x002c,  33,   9, 0 ),
+  V(  36, 0x5ae1,  37,  37, 1 ),
+  V(  37, 0x484c,  64,  38, 0 ),
+  V(  38, 0x3a0d,  65,  39, 0 ),
+  V(  39, 0x2ef1,  67,  40, 0 ),
+  V(  40, 0x261f,  68,  41, 0 ),
+  V(  41, 0x1f33,  69,  42, 0 ),
+  V(  42, 0x19a8,  70,  43, 0 ),
+  V(  43, 0x1518,  72,  44, 0 ),
+  V(  44, 0x1177,  73,  45, 0 ),
+  V(  45, 0x0e74,  74,  46, 0 ),
+  V(  46, 0x0bfb,  75,  47, 0 ),
+  V(  47, 0x09f8,  77,  48, 0 ),
+  V(  48, 0x0861,  78,  49, 0 ),
+  V(  49, 0x0706,  79,  50, 0 ),
+  V(  50, 0x05cd,  48,  51, 0 ),
+  V(  51, 0x04de,  50,  52, 0 ),
+  V(  52, 0x040f,  50,  53, 0 ),
+  V(  53, 0x0363,  51,  54, 0 ),
+  V(  54, 0x02d4,  52,  55, 0 ),
+  V(  55, 0x025c,  53,  56, 0 ),
+  V(  56, 0x01f8,  54,  57, 0 ),
+  V(  57, 0x01a4,  55,  58, 0 ),
+  V(  58, 0x0160,  56,  59, 0 ),
+  V(  59, 0x0125,  57,  60, 0 ),
+  V(  60, 0x00f6,  58,  61, 0 ),
+  V(  61, 0x00cb,  59,  62, 0 ),
+  V(  62, 0x00ab,  61,  63, 0 ),
+  V(  63, 0x008f,  61,  32, 0 ),
+  V(  64, 0x5b12,  65,  65, 1 ),
+  V(  65, 0x4d04,  80,  66, 0 ),
+  V(  66, 0x412c,  81,  67, 0 ),
+  V(  67, 0x37d8,  82,  68, 0 ),
+  V(  68, 0x2fe8,  83,  69, 0 ),
+  V(  69, 0x293c,  84,  70, 0 ),
+  V(  70, 0x2379,  86,  71, 0 ),
+  V(  71, 0x1edf,  87,  72, 0 ),
+  V(  72, 0x1aa9,  87,  73, 0 ),
+  V(  73, 0x174e,  72,  74, 0 ),
+  V(  74, 0x1424,  72,  75, 0 ),
+  V(  75, 0x119c,  74,  76, 0 ),
+  V(  76, 0x0f6b,  74,  77, 0 ),
+  V(  77, 0x0d51,  75,  78, 0 ),
+  V(  78, 0x0bb6,  77,  79, 0 ),
+  V(  79, 0x0a40,  77,  48, 0 ),
+  V(  80, 0x5832,  80,  81, 1 ),
+  V(  81, 0x4d1c,  88,  82, 0 ),
+  V(  82, 0x438e,  89,  83, 0 ),
+  V(  83, 0x3bdd,  90,  84, 0 ),
+  V(  84, 0x34ee,  91,  85, 0 ),
+  V(  85, 0x2eae,  92,  86, 0 ),
+  V(  86, 0x299a,  93,  87, 0 ),
+  V(  87, 0x2516,  86,  71, 0 ),
+  V(  88, 0x5570,  88,  89, 1 ),
+  V(  89, 0x4ca9,  95,  90, 0 ),
+  V(  90, 0x44d9,  96,  91, 0 ),
+  V(  91, 0x3e22,  97,  92, 0 ),
+  V(  92, 0x3824,  99,  93, 0 ),
+  V(  93, 0x32b4,  99,  94, 0 ),
+  V(  94, 0x2e17,  93,  86, 0 ),
+  V(  95, 0x56a8,  95,  96, 1 ),
+  V(  96, 0x4f46, 101,  97, 0 ),
+  V(  97, 0x47e5, 102,  98, 0 ),
+  V(  98, 0x41cf, 103,  99, 0 ),
+  V(  99, 0x3c3d, 104, 100, 0 ),
+  V( 100, 0x375e,  99,  93, 0 ),
+  V( 101, 0x5231, 105, 102, 0 ),
+  V( 102, 0x4c0f, 106, 103, 0 ),
+  V( 103, 0x4639, 107, 104, 0 ),
+  V( 104, 0x415e, 103,  99, 0 ),
+  V( 105, 0x5627, 105, 106, 1 ),
+  V( 106, 0x50e7, 108, 107, 0 ),
+  V( 107, 0x4b85, 109, 103, 0 ),
+  V( 108, 0x5597, 110, 109, 0 ),
+  V( 109, 0x504f, 111, 107, 0 ),
+  V( 110, 0x5a10, 110, 111, 1 ),
+  V( 111, 0x5522, 112, 109, 0 ),
+  V( 112, 0x59eb, 112, 111, 1 ),
+/*
+ * This last entry is used for fixed probability estimate of 0.5
+ * as suggested in Section 10.3 Table 5 of ITU-T Rec. T.851.
+ */
+  V( 113, 0x5a1d, 113, 113, 0 )
+};
--- a/3rdparty/libjpeg/jcapimin.c
+++ b/3rdparty/libjpeg/jcapimin.c
@@ -2,6 +2,7 @@
 * jcapimin.c
 *
 * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2003-2010 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -63,14 +64,21 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)

  cinfo->comp_info = NULL;

-  for (i = 0; i < NUM_QUANT_TBLS; i++)
+  for (i = 0; i < NUM_QUANT_TBLS; i++) {
    cinfo->quant_tbl_ptrs[i] = NULL;
+    cinfo->q_scale_factor[i] = 100;
+  }

  for (i = 0; i < NUM_HUFF_TBLS; i++) {
    cinfo->dc_huff_tbl_ptrs[i] = NULL;
    cinfo->ac_huff_tbl_ptrs[i] = NULL;
  }

+  /* Must do it here for emit_dqt in case jpeg_write_tables is used */
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2-1;
+
  cinfo->script_space = NULL;

  cinfo->input_gamma = 1.0;	/* in case application forgets */
--- a/3rdparty/libjpeg/jcarith.c
+++ b/3rdparty/libjpeg/jcarith.c
@@ -0,0 +1,943 @@
+/*
+ * jcarith.c
+ *
+ * Developed 1997-2012 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy encoder object for arithmetic encoding. */
+
+typedef struct {
+  struct jpeg_entropy_encoder pub; /* public fields */
+
+  INT32 c; /* C register, base of coding interval, layout as in sec. D.1.3 */
+  INT32 a;               /* A register, normalized size of coding interval */
+  INT32 sc;        /* counter for stacked 0xFF values which might overflow */
+  INT32 zc;          /* counter for pending 0x00 output values which might *
+                          * be discarded at the end ("Pacman" termination) */
+  int ct;  /* bit shift counter, determines when next byte will be written */
+  int buffer;                /* buffer for most recent output byte != 0xFF */
+
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+  int next_restart_num;		/* next restart number to write (0-7) */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_encoder;
+
+typedef arith_entropy_encoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+/* NOTE: Uncomment the following #define if you want to use the
+ * given formula for calculating the AC conditioning parameter Kx
+ * for spectral selection progressive coding in section G.1.3.2
+ * of the spec (Kx = Kmin + SRL (8 + Se - Kmin) 4).
+ * Although the spec and P&M authors claim that this "has proven
+ * to give good results for 8 bit precision samples", I'm not
+ * convinced yet that this is really beneficial.
+ * Early tests gave only very marginal compression enhancements
+ * (a few - around 5 or so - bytes even for very large files),
+ * which would turn out rather negative if we'd suppress the
+ * DAC (Define Arithmetic Conditioning) marker segments for
+ * the default parameters in the future.
+ * Note that currently the marker writing module emits 12-byte
+ * DAC segments for a full-component scan in a color image.
+ * This is not worth worrying about IMHO. However, since the
+ * spec defines the default values to be used if the tables
+ * are omitted (unlike Huffman tables, which are required
+ * anyway), one might optimize this behaviour in the future,
+ * and then it would be disadvantageous to use custom tables if
+ * they don't provide sufficient gain to exceed the DAC size.
+ *
+ * On the other hand, I'd consider it as a reasonable result
+ * that the conditioning has no significant influence on the
+ * compression performance. This means that the basic
+ * statistical model is already rather stable.
+ *
+ * Thus, at the moment, we use the default conditioning values
+ * anyway, and do not use the custom formula.
+ *
+#define CALCULATE_SPECTRAL_CONDITIONING
+ */
+
+/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
+ * We assume that int right shift is unsigned if INT32 right shift is,
+ * which should be safe.
+ */
+
+#ifdef RIGHT_SHIFT_IS_UNSIGNED
+#define ISHIFT_TEMPS	int ishift_temp;
+#define IRIGHT_SHIFT(x,shft)  \
+        ((ishift_temp = (x)) < 0 ? \
+         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
+         (ishift_temp >> (shft)))
+#else
+#define ISHIFT_TEMPS
+#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
+#endif
+
+
+LOCAL(void)
+emit_byte (int val, j_compress_ptr cinfo)
+/* Write next output byte; we do not support suspension in this module. */
+{
+  struct jpeg_destination_mgr * dest = cinfo->dest;
+
+  *dest->next_output_byte++ = (JOCTET) val;
+  if (--dest->free_in_buffer == 0)
+    if (! (*dest->empty_output_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+}
+
+
+/*
+ * Finish up at the end of an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+finish_pass (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  INT32 temp;
+
+  /* Section D.1.8: Termination of encoding */
+
+  /* Find the e->c in the coding interval with the largest
+   * number of trailing zero bits */
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+    e->c = temp + 0x8000L;
+  else
+    e->c = temp;
+  /* Send remaining bytes to output */
+  e->c <<= e->ct;
+  if (e->c & 0xF8000000L) {
+    /* One final overflow has to be handled */
+    if (e->buffer >= 0) {
+      if (e->zc)
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
+      emit_byte(e->buffer + 1, cinfo);
+      if (e->buffer + 1 == 0xFF)
+        emit_byte(0x00, cinfo);
+    }
+    e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+    e->sc = 0;
+  } else {
+    if (e->buffer == 0)
+      ++e->zc;
+    else if (e->buffer >= 0) {
+      if (e->zc)
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
+      emit_byte(e->buffer, cinfo);
+    }
+    if (e->sc) {
+      if (e->zc)
+        do emit_byte(0x00, cinfo);
+        while (--e->zc);
+      do {
+        emit_byte(0xFF, cinfo);
+        emit_byte(0x00, cinfo);
+      } while (--e->sc);
+    }
+  }
+  /* Output final bytes only if they are not 0x00 */
+  if (e->c & 0x7FFF800L) {
+    if (e->zc)  /* output final pending zero bytes */
+      do emit_byte(0x00, cinfo);
+      while (--e->zc);
+    emit_byte((e->c >> 19) & 0xFF, cinfo);
+    if (((e->c >> 19) & 0xFF) == 0xFF)
+      emit_byte(0x00, cinfo);
+    if (e->c & 0x7F800L) {
+      emit_byte((e->c >> 11) & 0xFF, cinfo);
+      if (((e->c >> 11) & 0xFF) == 0xFF)
+        emit_byte(0x00, cinfo);
+    }
+  }
+}
+
+
+/*
+ * The core arithmetic encoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Parameter 'val' to be encoded may be 0 or 1 (binary decision).
+ *
+ * Note: I've added full "Pacman" termination support to the
+ * byte output routines, which is equivalent to the optional
+ * Discard_final_zeros procedure (Figure D.15) in the spec.
+ * Thus, we always produce the shortest possible output
+ * stream compliant to the spec (no trailing zero bytes,
+ * except for FF stuffing).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(void)
+arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv;
+
+  /* Fetch values from our compact representation of Table D.3(D.2):
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
+  e->a -= qe;
+  if (val != (sv >> 7)) {
+    /* Encode the less probable symbol */
+    if (e->a >= qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency, otherwise code the LPS
+       * as usual: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+  } else {
+    /* Encode the more probable symbol */
+    if (e->a >= 0x8000L)
+      return;  /* A >= 0x8000 -> ready, no renormalization required */
+    if (e->a < qe) {
+      /* If the interval size (qe) for the less probable symbol (LPS)
+       * is larger than the interval size for the MPS, then exchange
+       * the two symbols for coding efficiency: */
+      e->c += e->a;
+      e->a = qe;
+    }
+    *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+  }
+
+  /* Renormalization & data output per section D.1.6 */
+  do {
+    e->a <<= 1;
+    e->c <<= 1;
+    if (--e->ct == 0) {
+      /* Another byte is ready for output */
+      temp = e->c >> 19;
+      if (temp > 0xFF) {
+        /* Handle overflow over all stacked 0xFF bytes */
+        if (e->buffer >= 0) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          emit_byte(e->buffer + 1, cinfo);
+          if (e->buffer + 1 == 0xFF)
+            emit_byte(0x00, cinfo);
+        }
+        e->zc += e->sc;  /* carry-over converts stacked 0xFF bytes to 0x00 */
+        e->sc = 0;
+        /* Note: The 3 spacer bits in the C register guarantee
+         * that the new buffer byte can't be 0xFF here
+         * (see page 160 in the P&M JPEG book). */
+        e->buffer = temp & 0xFF;  /* new output byte, might overflow later */
+      } else if (temp == 0xFF) {
+        ++e->sc;  /* stack 0xFF byte (which might overflow later) */
+      } else {
+        /* Output all stacked 0xFF bytes, they will not overflow any more */
+        if (e->buffer == 0)
+          ++e->zc;
+        else if (e->buffer >= 0) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          emit_byte(e->buffer, cinfo);
+        }
+        if (e->sc) {
+          if (e->zc)
+            do emit_byte(0x00, cinfo);
+            while (--e->zc);
+          do {
+            emit_byte(0xFF, cinfo);
+            emit_byte(0x00, cinfo);
+          } while (--e->sc);
+        }
+        e->buffer = temp & 0xFF;  /* new output byte (can still overflow) */
+      }
+      e->c &= 0x7FFFFL;
+      e->ct += 8;
+    }
+  } while (e->a < 0x8000L);
+}
+
+
+/*
+ * Emit a restart marker & resynchronize predictions.
+ */
+
+LOCAL(void)
+emit_restart (j_compress_ptr cinfo, int restart_num)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  finish_pass(cinfo);
+
+  emit_byte(0xFF, cinfo);
+  emit_byte(JPEG_RST0 + restart_num, cinfo);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+}
+
+
+/*
+ * MCU encoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl;
+  int v, v2, m;
+  ISHIFT_TEMPS
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Compute the DC value after the required point transform by Al.
+     * This is simply an arithmetic right shift.
+     */
+    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = m - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = m;
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+        arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+        st += 2;			/* Table F.4: SP = S0 + 2 */
+        entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+        v = -v;
+        arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+        st += 3;			/* Table F.4: SN = S0 + 3 */
+        entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+        entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+        entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke;
+  int v, v2, m;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  ke = cinfo->Se;
+  do {
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+  } while (--ke);
+
+  /* Figure F.5: Encode_AC_Coefficients */
+  for (k = cinfo->Ss - 1; k < ke;) {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    arith_encode(cinfo, st, 0);		/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[natural_order[++k]]) >= 0) {
+        if (v >>= cinfo->Al) {
+          arith_encode(cinfo, st + 1, 1);
+          arith_encode(cinfo, entropy->fixed_bin, 0);
+          break;
+        }
+      } else {
+        v = -v;
+        if (v >>= cinfo->Al) {
+          arith_encode(cinfo, st + 1, 1);
+          arith_encode(cinfo, entropy->fixed_bin, 1);
+          break;
+        }
+      }
+      arith_encode(cinfo, st + 1, 0);
+      st += 3;
+    }
+    st += 2;
+    /* Figure F.8: Encoding the magnitude category of v */
+    m = 0;
+    if (v -= 1) {
+      arith_encode(cinfo, st, 1);
+      m = 1;
+      v2 = v;
+      if (v2 >>= 1) {
+        arith_encode(cinfo, st, 1);
+        m <<= 1;
+        st = entropy->ac_stats[tbl] +
+             (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
+      }
+    }
+    arith_encode(cinfo, st, 0);
+    /* Figure F.9: Encoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      arith_encode(cinfo, st, (m & v) ? 1 : 0);
+  }
+  /* Encode EOB decision only if k < cinfo->Se */
+  if (k < cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int Al, blkn;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  Al = cinfo->Al;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* We simply emit the Al'th bit of the DC coefficient value. */
+    arith_encode(cinfo, st, (MCU_data[blkn][0][0] >> Al) & 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU encoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, k, ke, kex;
+  int v;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data block */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Section G.1.3.3: Encoding of AC coefficients */
+
+  /* Establish EOB (end-of-block) index */
+  ke = cinfo->Se;
+  do {
+    /* We must apply the point transform by Al.  For AC coefficients this
+     * is an integer division with rounding towards 0.  To do this portably
+     * in C, we shift after obtaining the absolute value.
+     */
+    if ((v = (*block)[natural_order[ke]]) >= 0) {
+      if (v >>= cinfo->Al) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Al) break;
+    }
+  } while (--ke);
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  for (kex = ke; kex > 0; kex--)
+    if ((v = (*block)[natural_order[kex]]) >= 0) {
+      if (v >>= cinfo->Ah) break;
+    } else {
+      v = -v;
+      if (v >>= cinfo->Ah) break;
+    }
+
+  /* Figure G.10: Encode_AC_Coefficients_SA */
+  for (k = cinfo->Ss - 1; k < ke;) {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    if (k >= kex)
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+    for (;;) {
+      if ((v = (*block)[natural_order[++k]]) >= 0) {
+        if (v >>= cinfo->Al) {
+          if (v >> 1)			/* previously nonzero coef */
+            arith_encode(cinfo, st + 2, (v & 1));
+          else {			/* newly nonzero coef */
+            arith_encode(cinfo, st + 1, 1);
+            arith_encode(cinfo, entropy->fixed_bin, 0);
+          }
+          break;
+        }
+      } else {
+        v = -v;
+        if (v >>= cinfo->Al) {
+          if (v >> 1)			/* previously nonzero coef */
+            arith_encode(cinfo, st + 2, (v & 1));
+          else {			/* newly nonzero coef */
+            arith_encode(cinfo, st + 1, 1);
+            arith_encode(cinfo, entropy->fixed_bin, 1);
+          }
+          break;
+        }
+      }
+      arith_encode(cinfo, st + 1, 0);
+      st += 3;
+    }
+  }
+  /* Encode EOB decision only if k < cinfo->Se */
+  if (k < cinfo->Se) {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    arith_encode(cinfo, st, 1);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Encode and output one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, k, ke;
+  int v, v2, m;
+  const int * natural_order;
+
+  /* Emit restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0) {
+      emit_restart(cinfo, entropy->next_restart_num);
+      entropy->restarts_to_go = cinfo->restart_interval;
+      entropy->next_restart_num++;
+      entropy->next_restart_num &= 7;
+    }
+    entropy->restarts_to_go--;
+  }
+
+  natural_order = cinfo->natural_order;
+
+  /* Encode the MCU data blocks */
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.4: Encode_DC_DIFF */
+    if ((v = (*block)[0] - entropy->last_dc_val[ci]) == 0) {
+      arith_encode(cinfo, st, 0);
+      entropy->dc_context[ci] = 0;	/* zero diff category */
+    } else {
+      entropy->last_dc_val[ci] = (*block)[0];
+      arith_encode(cinfo, st, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+        arith_encode(cinfo, st + 1, 0);	/* Table F.4: SS = S0 + 1 */
+        st += 2;			/* Table F.4: SP = S0 + 2 */
+        entropy->dc_context[ci] = 4;	/* small positive diff category */
+      } else {
+        v = -v;
+        arith_encode(cinfo, st + 1, 1);	/* Table F.4: SS = S0 + 1 */
+        st += 3;			/* Table F.4: SN = S0 + 3 */
+        entropy->dc_context[ci] = 8;	/* small negative diff category */
+      }
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        st = entropy->dc_stats[tbl] + 20; /* Table F.4: X1 = 20 */
+        while (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st += 1;
+        }
+      }
+      arith_encode(cinfo, st, 0);
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+        entropy->dc_context[ci] = 0;	/* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+        entropy->dc_context[ci] += 8;	/* large diff category */
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+
+    /* Sections F.1.4.2 & F.1.4.4.2: Encoding of AC coefficients */
+
+    if ((ke = cinfo->lim_Se) == 0) continue;
+    tbl = compptr->ac_tbl_no;
+
+    /* Establish EOB (end-of-block) index */
+    do {
+      if ((*block)[natural_order[ke]]) break;
+    } while (--ke);
+
+    /* Figure F.5: Encode_AC_Coefficients */
+    for (k = 0; k < ke;) {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      arith_encode(cinfo, st, 0);	/* EOB decision */
+      while ((v = (*block)[natural_order[++k]]) == 0) {
+        arith_encode(cinfo, st + 1, 0);
+        st += 3;
+      }
+      arith_encode(cinfo, st + 1, 1);
+      /* Figure F.6: Encoding nonzero value v */
+      /* Figure F.7: Encoding the sign of v */
+      if (v > 0) {
+        arith_encode(cinfo, entropy->fixed_bin, 0);
+      } else {
+        v = -v;
+        arith_encode(cinfo, entropy->fixed_bin, 1);
+      }
+      st += 2;
+      /* Figure F.8: Encoding the magnitude category of v */
+      m = 0;
+      if (v -= 1) {
+        arith_encode(cinfo, st, 1);
+        m = 1;
+        v2 = v;
+        if (v2 >>= 1) {
+          arith_encode(cinfo, st, 1);
+          m <<= 1;
+          st = entropy->ac_stats[tbl] +
+               (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+          while (v2 >>= 1) {
+            arith_encode(cinfo, st, 1);
+            m <<= 1;
+            st += 1;
+          }
+        }
+      }
+      arith_encode(cinfo, st, 0);
+      /* Figure F.9: Encoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        arith_encode(cinfo, st, (m & v) ? 1 : 0);
+    }
+    /* Encode EOB decision only if k < cinfo->lim_Se */
+    if (k < cinfo->lim_Se) {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      arith_encode(cinfo, st, 1);
+    }
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (gather_statistics)
+    /* Make sure to avoid that in the master control logic!
+     * We are fully adaptive here and need no extra
+     * statistics gathering pass!
+     */
+    ERREXIT(cinfo, JERR_NOT_COMPILED);
+
+  /* We assume jcmaster.c already validated the progressive scan parameters. */
+
+  /* Select execution routines */
+  if (cinfo->progressive_mode) {
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+        entropy->pub.encode_mcu = encode_mcu_DC_first;
+      else
+        entropy->pub.encode_mcu = encode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+        entropy->pub.encode_mcu = encode_mcu_DC_refine;
+      else
+        entropy->pub.encode_mcu = encode_mcu_AC_refine;
+    }
+  } else
+    entropy->pub.encode_mcu = encode_mcu;
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    /* AC needs no table when not present */
+    if (cinfo->Se) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+#ifdef CALCULATE_SPECTRAL_CONDITIONING
+      if (cinfo->progressive_mode)
+        /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
+        cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+#endif
+    }
+  }
+
+  /* Initialize arithmetic encoding variables */
+  entropy->c = 0;
+  entropy->a = 0x10000L;
+  entropy->sc = 0;
+  entropy->zc = 0;
+  entropy->ct = 11;
+  entropy->buffer = -1;  /* empty */
+
+  /* Initialize restart stuff */
+  entropy->restarts_to_go = cinfo->restart_interval;
+  entropy->next_restart_num = 0;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy encoding.
+ */
+
+GLOBAL(void)
+jinit_arith_encoder (j_compress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                SIZEOF(arith_entropy_encoder));
+  cinfo->entropy = &entropy->pub;
+  entropy->pub.start_pass = start_pass;
+  entropy->pub.finish_pass = finish_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+}
--- a/3rdparty/libjpeg/jccoefct.c
+++ b/3rdparty/libjpeg/jccoefct.c
@@ -2,6 +2,7 @@
 * jccoefct.c
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2003-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -149,6 +150,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  int blkn, bi, ci, yindex, yoffset, blockcnt;
  JDIMENSION ypos, xpos;
  jpeg_component_info *compptr;
+  forward_DCT_ptr forward_DCT;

  /* Loop to write as much as one whole iMCU row */
  for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
@@ -167,20 +169,22 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
      blkn = 0;
      for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
        compptr = cinfo->cur_comp_info[ci];
+        forward_DCT = cinfo->fdct->forward_DCT[compptr->component_index];
        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
                                                : compptr->last_col_width;
        xpos = MCU_col_num * compptr->MCU_sample_width;
-    ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
+        ypos = yoffset * compptr->DCT_v_scaled_size;
+        /* ypos == (yoffset+yindex) * DCTSIZE */
        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
          if (coef->iMCU_row_num < last_iMCU_row ||
              yoffset+yindex < compptr->last_row_height) {
-        (*cinfo->fdct->forward_DCT) (cinfo, compptr,
+            (*forward_DCT) (cinfo, compptr,
                            input_buf[compptr->component_index],
                            coef->MCU_buffer[blkn],
                            ypos, xpos, (JDIMENSION) blockcnt);
            if (blockcnt < compptr->MCU_width) {
              /* Create some dummy blocks at the right edge of the image. */
-          jzero_far((void FAR *) coef->MCU_buffer[blkn + blockcnt],
+              FMEMZERO((void FAR *) coef->MCU_buffer[blkn + blockcnt],
                       (compptr->MCU_width - blockcnt) * SIZEOF(JBLOCK));
              for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
                coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
@@ -188,14 +192,14 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
            }
          } else {
            /* Create a row of dummy blocks at the bottom of the image. */
-        jzero_far((void FAR *) coef->MCU_buffer[blkn],
+            FMEMZERO((void FAR *) coef->MCU_buffer[blkn],
                     compptr->MCU_width * SIZEOF(JBLOCK));
            for (bi = 0; bi < compptr->MCU_width; bi++) {
              coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
            }
          }
          blkn += compptr->MCU_width;
-      ypos += DCTSIZE;
+          ypos += compptr->DCT_v_scaled_size;
        }
      }
      /* Try to write the MCU.  In event of a suspension failure, we will
@@ -252,6 +256,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  jpeg_component_info *compptr;
  JBLOCKARRAY buffer;
  JBLOCKROW thisblockrow, lastblockrow;
+  forward_DCT_ptr forward_DCT;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
@@ -274,19 +279,19 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
    ndummy = (int) (blocks_across % h_samp_factor);
    if (ndummy > 0)
      ndummy = h_samp_factor - ndummy;
+    forward_DCT = cinfo->fdct->forward_DCT[ci];
    /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
     * on forward_DCT processes a complete horizontal row of DCT blocks.
     */
    for (block_row = 0; block_row < block_rows; block_row++) {
      thisblockrow = buffer[block_row];
-      (*cinfo->fdct->forward_DCT) (cinfo, compptr,
-                   input_buf[ci], thisblockrow,
-                   (JDIMENSION) (block_row * DCTSIZE),
+      (*forward_DCT) (cinfo, compptr, input_buf[ci], thisblockrow,
+                      (JDIMENSION) (block_row * compptr->DCT_v_scaled_size),
                      (JDIMENSION) 0, blocks_across);
      if (ndummy > 0) {
        /* Create dummy blocks at the right edge of the image. */
        thisblockrow += blocks_across; /* => first dummy block */
-    jzero_far((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
+        FMEMZERO((void FAR *) thisblockrow, ndummy * SIZEOF(JBLOCK));
        lastDC = thisblockrow[-1][0];
        for (bi = 0; bi < ndummy; bi++) {
          thisblockrow[bi][0] = lastDC;
@@ -305,7 +310,7 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
           block_row++) {
        thisblockrow = buffer[block_row];
        lastblockrow = buffer[block_row-1];
-    jzero_far((void FAR *) thisblockrow,
+        FMEMZERO((void FAR *) thisblockrow,
                 (size_t) (blocks_across * SIZEOF(JBLOCK)));
        for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
          lastDC = lastblockrow[h_samp_factor-1][0];
--- a/3rdparty/libjpeg/jccolor.c
+++ b/3rdparty/libjpeg/jccolor.c
@@ -2,6 +2,7 @@
 * jccolor.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -132,8 +133,8 @@ rgb_ycc_convert (j_compress_ptr cinfo,
                 JDIMENSION output_row, int num_rows)
 {
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register int r, g, b;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2;
  register JDIMENSION col;
@@ -149,7 +150,6 @@ rgb_ycc_convert (j_compress_ptr cinfo,
      r = GETJSAMPLE(inptr[RGB_RED]);
      g = GETJSAMPLE(inptr[RGB_GREEN]);
      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
@@ -167,6 +167,7 @@ rgb_ycc_convert (j_compress_ptr cinfo,
      outptr2[col] = (JSAMPLE)
                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
                 >> SCALEBITS);
+      inptr += RGB_PIXELSIZE;
    }
  }
 }
@@ -188,8 +189,8 @@ rgb_gray_convert (j_compress_ptr cinfo,
                  JDIMENSION output_row, int num_rows)
 {
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register int r, g, b;
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
@@ -197,17 +198,16 @@ rgb_gray_convert (j_compress_ptr cinfo,

  while (--num_rows >= 0) {
    inptr = *input_buf++;
-    outptr = output_buf[0][output_row];
-    output_row++;
+    outptr = output_buf[0][output_row++];
    for (col = 0; col < num_cols; col++) {
      r = GETJSAMPLE(inptr[RGB_RED]);
      g = GETJSAMPLE(inptr[RGB_GREEN]);
      b = GETJSAMPLE(inptr[RGB_BLUE]);
-      inptr += RGB_PIXELSIZE;
      /* Y */
      outptr[col] = (JSAMPLE)
                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
                 >> SCALEBITS);
+      inptr += RGB_PIXELSIZE;
    }
  }
 }
@@ -227,8 +227,8 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
                   JDIMENSION output_row, int num_rows)
 {
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
-  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
+  register int r, g, b;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
@@ -247,7 +247,6 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
      /* K passes through as-is */
      outptr3[col] = inptr[3];	/* don't need GETJSAMPLE here */
-      inptr += 4;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
@@ -265,6 +264,46 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
      outptr2[col] = (JSAMPLE)
                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
                 >> SCALEBITS);
+      inptr += 4;
+    }
+  }
+}
+
+
+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * [R,G,B] to [R-G,G,B-G] conversion with modulo calculation
+ * (forward reversible color transform).
+ */
+
+METHODDEF(void)
+rgb_rgb1_convert (j_compress_ptr cinfo,
+                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+                  JDIMENSION output_row, int num_rows)
+{
+  register int r, g, b;
+  register JSAMPROW inptr;
+  register JSAMPROW outptr0, outptr1, outptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->image_width;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr[RGB_RED]);
+      g = GETJSAMPLE(inptr[RGB_GREEN]);
+      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      /* Assume that MAXJSAMPLE+1 is a power of 2, so that the MOD
+       * (modulo) operator is equivalent to the bitmask operator AND.
+       */
+      outptr0[col] = (JSAMPLE) ((r - g + CENTERJSAMPLE) & MAXJSAMPLE);
+      outptr1[col] = (JSAMPLE) g;
+      outptr2[col] = (JSAMPLE) ((b - g + CENTERJSAMPLE) & MAXJSAMPLE);
+      inptr += RGB_PIXELSIZE;
    }
  }
 }
@@ -281,16 +320,15 @@ grayscale_convert (j_compress_ptr cinfo,
                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                   JDIMENSION output_row, int num_rows)
 {
+  int instride = cinfo->input_components;
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
-  int instride = cinfo->input_components;

  while (--num_rows >= 0) {
    inptr = *input_buf++;
-    outptr = output_buf[0][output_row];
-    output_row++;
+    outptr = output_buf[0][output_row++];
    for (col = 0; col < num_cols; col++) {
      outptr[col] = inptr[0];	/* don't need GETJSAMPLE() here */
      inptr += instride;
@@ -299,6 +337,39 @@ grayscale_convert (j_compress_ptr cinfo,
 }


+/*
+ * Convert some rows of samples to the JPEG colorspace.
+ * No colorspace conversion, but change from interleaved
+ * to separate-planes representation.
+ */
+
+METHODDEF(void)
+rgb_convert (j_compress_ptr cinfo,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows)
+{
+  register JSAMPROW inptr;
+  register JSAMPROW outptr0, outptr1, outptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->image_width;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
+      inptr += RGB_PIXELSIZE;
+    }
+  }
+}
+
+
 /*
 * Convert some rows of samples to the JPEG colorspace.
 * This version handles multi-component colorspaces without conversion.
@@ -310,20 +381,20 @@ null_convert (j_compress_ptr cinfo,
              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
              JDIMENSION output_row, int num_rows)
 {
+  int ci;
+  register int nc = cinfo->num_components;
  register JSAMPROW inptr;
  register JSAMPROW outptr;
  register JDIMENSION col;
-  register int ci;
-  int nc = cinfo->num_components;
  JDIMENSION num_cols = cinfo->image_width;

  while (--num_rows >= 0) {
    /* It seems fastest to make a separate pass for each component. */
    for (ci = 0; ci < nc; ci++) {
-      inptr = *input_buf;
+      inptr = input_buf[0] + ci;
      outptr = output_buf[ci][output_row];
      for (col = 0; col < num_cols; col++) {
-    outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+        *outptr++ = *inptr;	/* don't need GETJSAMPLE() here */
        inptr += nc;
      }
    }
@@ -356,7 +427,7 @@ jinit_color_converter (j_compress_ptr cinfo)
  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_color_converter));
-  cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
+  cinfo->cconvert = &cconvert->pub;
  /* set start_pass to null method until we find out differently */
  cconvert->pub.start_pass = null_method;

@@ -368,11 +439,9 @@ jinit_color_converter (j_compress_ptr cinfo)
    break;

  case JCS_RGB:
-#if RGB_PIXELSIZE != 3
    if (cinfo->input_components != RGB_PIXELSIZE)
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    break;
-#endif /* else share code with YCbCr */

  case JCS_YCbCr:
    if (cinfo->input_components != 3)
@@ -391,28 +460,41 @@ jinit_color_converter (j_compress_ptr cinfo)
    break;
  }

+  /* Support color transform only for RGB colorspace */
+  if (cinfo->color_transform && cinfo->jpeg_color_space != JCS_RGB)
+    ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+
  /* Check num_components, set conversion method based on requested space */
  switch (cinfo->jpeg_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->num_components != 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_GRAYSCALE)
+    if (cinfo->in_color_space == JCS_GRAYSCALE ||
+        cinfo->in_color_space == JCS_YCbCr)
      cconvert->pub.color_convert = grayscale_convert;
    else if (cinfo->in_color_space == JCS_RGB) {
      cconvert->pub.start_pass = rgb_ycc_start;
      cconvert->pub.color_convert = rgb_gray_convert;
-    } else if (cinfo->in_color_space == JCS_YCbCr)
-      cconvert->pub.color_convert = grayscale_convert;
-    else
+    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_RGB:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
-    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3)
-      cconvert->pub.color_convert = null_convert;
-    else
+    if (cinfo->in_color_space == JCS_RGB) {
+      switch (cinfo->color_transform) {
+      case JCT_NONE:
+        cconvert->pub.color_convert = rgb_convert;
+        break;
+      case JCT_SUBTRACT_GREEN:
+        cconvert->pub.color_convert = rgb_rgb1_convert;
+        break;
+      default:
+        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+        break;
+      }
+    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

--- a/3rdparty/libjpeg/jcdctmgr.c
+++ b/3rdparty/libjpeg/jcdctmgr.c
@@ -23,7 +23,7 @@ typedef struct {
  struct jpeg_forward_dct pub;	/* public fields */

  /* Pointer to the DCT routine actually in use */
-  forward_DCT_method_ptr do_dct;
+  forward_DCT_method_ptr do_dct[MAX_COMPONENTS];

  /* The actual post-DCT divisors --- not identical to the quant table
   * entries, because of scaling (especially for an unnormalized DCT).
@@ -33,7 +33,7 @@ typedef struct {

 #ifdef DCT_FLOAT_SUPPORTED
  /* Same as above for the floating-point case. */
-  float_DCT_method_ptr do_float_dct;
+  float_DCT_method_ptr do_float_dct[MAX_COMPONENTS];
  FAST_FLOAT * float_divisors[NUM_QUANT_TBLS];
 #endif
 } my_fdct_controller;
@@ -41,131 +41,16 @@ typedef struct {
 typedef my_fdct_controller * my_fdct_ptr;


-/*
- * Initialize for a processing pass.
- * Verify that all referenced Q-tables are present, and set up
- * the divisor table for each one.
- * In the current implementation, DCT of all components is done during
- * the first pass, even if only some components will be output in the
- * first scan.  Hence all components should be examined here.
+/* The current scaled-DCT routines require ISLOW-style divisor tables,
+ * so be sure to compile that code if either ISLOW or SCALING is requested.
 */
-
-METHODDEF(void)
-start_pass_fdctmgr (j_compress_ptr cinfo)
-{
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  int ci, qtblno, i;
-  jpeg_component_info *compptr;
-  JQUANT_TBL * qtbl;
-  DCTELEM * dtbl;
-
-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
-    qtblno = compptr->quant_tbl_no;
-    /* Make sure specified quantization table is present */
-    if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
-    cinfo->quant_tbl_ptrs[qtblno] == NULL)
-      ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
-    qtbl = cinfo->quant_tbl_ptrs[qtblno];
-    /* Compute divisors for this quant table */
-    /* We may do this more than once for same table, but it's not a big deal */
-    switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
-    case JDCT_ISLOW:
-      /* For LL&M IDCT method, divisors are equal to raw quantization
-       * coefficients multiplied by 8 (to counteract scaling).
-       */
-      if (fdct->divisors[qtblno] == NULL) {
-    fdct->divisors[qtblno] = (DCTELEM *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                      DCTSIZE2 * SIZEOF(DCTELEM));
-      }
-      dtbl = fdct->divisors[qtblno];
-      for (i = 0; i < DCTSIZE2; i++) {
-    dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
-      }
-      break;
+#define PROVIDE_ISLOW_TABLES
+#else
+#ifdef DCT_SCALING_SUPPORTED
+#define PROVIDE_ISLOW_TABLES
 #endif
-#ifdef DCT_IFAST_SUPPORTED
-    case JDCT_IFAST:
-      {
-    /* For AA&N IDCT method, divisors are equal to quantization
-     * coefficients scaled by scalefactor[row]*scalefactor[col], where
-     *   scalefactor[0] = 1
-     *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-     * We apply a further scale factor of 8.
-     */
-#define CONST_BITS 14
-    static const INT16 aanscales[DCTSIZE2] = {
-      /* precomputed values scaled up by 14 bits */
-      16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-      22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
-      21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
-      19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
-      16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
-      12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
-       8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
-       4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
-    };
-    SHIFT_TEMPS
-
-    if (fdct->divisors[qtblno] == NULL) {
-      fdct->divisors[qtblno] = (DCTELEM *)
-        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                    DCTSIZE2 * SIZEOF(DCTELEM));
-    }
-    dtbl = fdct->divisors[qtblno];
-    for (i = 0; i < DCTSIZE2; i++) {
-      dtbl[i] = (DCTELEM)
-        DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
-                  (INT32) aanscales[i]),
-            CONST_BITS-3);
-    }
-      }
-      break;
 #endif
-#ifdef DCT_FLOAT_SUPPORTED
-    case JDCT_FLOAT:
-      {
-    /* For float AA&N IDCT method, divisors are equal to quantization
-     * coefficients scaled by scalefactor[row]*scalefactor[col], where
-     *   scalefactor[0] = 1
-     *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
-     * We apply a further scale factor of 8.
-     * What's actually stored is 1/divisor so that the inner loop can
-     * use a multiplication rather than a division.
-     */
-    FAST_FLOAT * fdtbl;
-    int row, col;
-    static const double aanscalefactor[DCTSIZE] = {
-      1.0, 1.387039845, 1.306562965, 1.175875602,
-      1.0, 0.785694958, 0.541196100, 0.275899379
-    };
-
-    if (fdct->float_divisors[qtblno] == NULL) {
-      fdct->float_divisors[qtblno] = (FAST_FLOAT *)
-        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                    DCTSIZE2 * SIZEOF(FAST_FLOAT));
-    }
-    fdtbl = fdct->float_divisors[qtblno];
-    i = 0;
-    for (row = 0; row < DCTSIZE; row++) {
-      for (col = 0; col < DCTSIZE; col++) {
-        fdtbl[i] = (FAST_FLOAT)
-          (1.0 / (((double) qtbl->quantval[i] *
-               aanscalefactor[row] * aanscalefactor[col] * 8.0)));
-        i++;
-      }
-    }
-      }
-      break;
-#endif
-    default:
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-      break;
-    }
-  }
-}


 /*
@@ -185,43 +70,16 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  forward_DCT_method_ptr do_dct = fdct->do_dct;
+  forward_DCT_method_ptr do_dct = fdct->do_dct[compptr->component_index];
  DCTELEM * divisors = fdct->divisors[compptr->quant_tbl_no];
  DCTELEM workspace[DCTSIZE2];	/* work area for FDCT subroutine */
  JDIMENSION bi;

  sample_data += start_row;	/* fold in the vertical offset once */

-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register DCTELEM *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-    elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-#else
-    { register int elemc;
-      for (elemc = DCTSIZE; elemc > 0; elemc--) {
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-      }
-    }
-#endif
-      }
-    }
-
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
    /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*do_dct) (workspace, sample_data, start_col);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
    { register DCTELEM temp, qval;
@@ -275,44 +133,16 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  /* This routine is heavily used, so it's worth coding it tightly. */
  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
-  float_DCT_method_ptr do_dct = fdct->do_float_dct;
+  float_DCT_method_ptr do_dct = fdct->do_float_dct[compptr->component_index];
  FAST_FLOAT * divisors = fdct->float_divisors[compptr->quant_tbl_no];
  FAST_FLOAT workspace[DCTSIZE2]; /* work area for FDCT subroutine */
  JDIMENSION bi;

  sample_data += start_row;	/* fold in the vertical offset once */

-  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
-    /* Load data into workspace, applying unsigned->signed conversion */
-    { register FAST_FLOAT *workspaceptr;
-      register JSAMPROW elemptr;
-      register int elemr;
-
-      workspaceptr = workspace;
-      for (elemr = 0; elemr < DCTSIZE; elemr++) {
-    elemptr = sample_data[elemr] + start_col;
-#if DCTSIZE == 8		/* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-#else
-    { register int elemc;
-      for (elemc = DCTSIZE; elemc > 0; elemc--) {
-        *workspaceptr++ = (FAST_FLOAT)
-          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-      }
-    }
-#endif
-      }
-    }
-
+  for (bi = 0; bi < num_blocks; bi++, start_col += compptr->DCT_h_scaled_size) {
    /* Perform the DCT */
-    (*do_dct) (workspace);
+    (*do_dct) (workspace, sample_data, start_col);

    /* Quantize/descale the coefficients, and store into coef_blocks[] */
    { register FAST_FLOAT temp;
@@ -337,6 +167,295 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info * compptr,
 #endif /* DCT_FLOAT_SUPPORTED */


+/*
+ * Initialize for a processing pass.
+ * Verify that all referenced Q-tables are present, and set up
+ * the divisor table for each one.
+ * In the current implementation, DCT of all components is done during
+ * the first pass, even if only some components will be output in the
+ * first scan.  Hence all components should be examined here.
+ */
+
+METHODDEF(void)
+start_pass_fdctmgr (j_compress_ptr cinfo)
+{
+  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  int ci, qtblno, i;
+  jpeg_component_info *compptr;
+  int method = 0;
+  JQUANT_TBL * qtbl;
+  DCTELEM * dtbl;
+
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    /* Select the proper DCT routine for this component's scaling */
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
+#ifdef DCT_SCALING_SUPPORTED
+    case ((1 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_1x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_2x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_3x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_4x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_5x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_6x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_7x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      fdct->do_dct[ci] = jpeg_fdct_9x9;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_10x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      fdct->do_dct[ci] = jpeg_fdct_11x11;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_12x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      fdct->do_dct[ci] = jpeg_fdct_13x13;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_14x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      fdct->do_dct[ci] = jpeg_fdct_15x15;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_16x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_16x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      fdct->do_dct[ci] = jpeg_fdct_14x7;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_12x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      fdct->do_dct[ci] = jpeg_fdct_10x5;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_8x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      fdct->do_dct[ci] = jpeg_fdct_6x3;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_4x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      fdct->do_dct[ci] = jpeg_fdct_2x1;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      fdct->do_dct[ci] = jpeg_fdct_8x16;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      fdct->do_dct[ci] = jpeg_fdct_7x14;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      fdct->do_dct[ci] = jpeg_fdct_6x12;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      fdct->do_dct[ci] = jpeg_fdct_5x10;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      fdct->do_dct[ci] = jpeg_fdct_4x8;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      fdct->do_dct[ci] = jpeg_fdct_3x6;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      fdct->do_dct[ci] = jpeg_fdct_2x4;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      fdct->do_dct[ci] = jpeg_fdct_1x2;
+      method = JDCT_ISLOW;	/* jfdctint uses islow-style table */
+      break;
+#endif
+    case ((DCTSIZE << 8) + DCTSIZE):
+      switch (cinfo->dct_method) {
+#ifdef DCT_ISLOW_SUPPORTED
+      case JDCT_ISLOW:
+        fdct->do_dct[ci] = jpeg_fdct_islow;
+        method = JDCT_ISLOW;
+        break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+      case JDCT_IFAST:
+        fdct->do_dct[ci] = jpeg_fdct_ifast;
+        method = JDCT_IFAST;
+        break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+      case JDCT_FLOAT:
+        fdct->do_float_dct[ci] = jpeg_fdct_float;
+        method = JDCT_FLOAT;
+        break;
+#endif
+      default:
+        ERREXIT(cinfo, JERR_NOT_COMPILED);
+        break;
+      }
+      break;
+    default:
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+               compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
+      break;
+    }
+    qtblno = compptr->quant_tbl_no;
+    /* Make sure specified quantization table is present */
+    if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
+        cinfo->quant_tbl_ptrs[qtblno] == NULL)
+      ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
+    qtbl = cinfo->quant_tbl_ptrs[qtblno];
+    /* Compute divisors for this quant table */
+    /* We may do this more than once for same table, but it's not a big deal */
+    switch (method) {
+#ifdef PROVIDE_ISLOW_TABLES
+    case JDCT_ISLOW:
+      /* For LL&M IDCT method, divisors are equal to raw quantization
+       * coefficients multiplied by 8 (to counteract scaling).
+       */
+      if (fdct->divisors[qtblno] == NULL) {
+        fdct->divisors[qtblno] = (DCTELEM *)
+          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                      DCTSIZE2 * SIZEOF(DCTELEM));
+      }
+      dtbl = fdct->divisors[qtblno];
+      for (i = 0; i < DCTSIZE2; i++) {
+        dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+      }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
+      break;
+#endif
+#ifdef DCT_IFAST_SUPPORTED
+    case JDCT_IFAST:
+      {
+        /* For AA&N IDCT method, divisors are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * We apply a further scale factor of 8.
+         */
+#define CONST_BITS 14
+        static const INT16 aanscales[DCTSIZE2] = {
+          /* precomputed values scaled up by 14 bits */
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
+          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
+          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
+          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
+          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
+           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
+           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
+        };
+        SHIFT_TEMPS
+
+        if (fdct->divisors[qtblno] == NULL) {
+          fdct->divisors[qtblno] = (DCTELEM *)
+            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                        DCTSIZE2 * SIZEOF(DCTELEM));
+        }
+        dtbl = fdct->divisors[qtblno];
+        for (i = 0; i < DCTSIZE2; i++) {
+          dtbl[i] = (DCTELEM)
+            DESCALE(MULTIPLY16V16((INT32) qtbl->quantval[i],
+                                  (INT32) aanscales[i]),
+                    CONST_BITS-3);
+        }
+      }
+      fdct->pub.forward_DCT[ci] = forward_DCT;
+      break;
+#endif
+#ifdef DCT_FLOAT_SUPPORTED
+    case JDCT_FLOAT:
+      {
+        /* For float AA&N IDCT method, divisors are equal to quantization
+         * coefficients scaled by scalefactor[row]*scalefactor[col], where
+         *   scalefactor[0] = 1
+         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * We apply a further scale factor of 8.
+         * What's actually stored is 1/divisor so that the inner loop can
+         * use a multiplication rather than a division.
+         */
+        FAST_FLOAT * fdtbl;
+        int row, col;
+        static const double aanscalefactor[DCTSIZE] = {
+          1.0, 1.387039845, 1.306562965, 1.175875602,
+          1.0, 0.785694958, 0.541196100, 0.275899379
+        };
+
+        if (fdct->float_divisors[qtblno] == NULL) {
+          fdct->float_divisors[qtblno] = (FAST_FLOAT *)
+            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                        DCTSIZE2 * SIZEOF(FAST_FLOAT));
+        }
+        fdtbl = fdct->float_divisors[qtblno];
+        i = 0;
+        for (row = 0; row < DCTSIZE; row++) {
+          for (col = 0; col < DCTSIZE; col++) {
+            fdtbl[i] = (FAST_FLOAT)
+              (1.0 / (((double) qtbl->quantval[i] *
+                       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
+            i++;
+          }
+        }
+      }
+      fdct->pub.forward_DCT[ci] = forward_DCT_float;
+      break;
+#endif
+    default:
+      ERREXIT(cinfo, JERR_NOT_COMPILED);
+      break;
+    }
+  }
+}
+
+
 /*
 * Initialize FDCT manager.
 */
@@ -353,30 +472,6 @@ jinit_forward_dct (j_compress_ptr cinfo)
  cinfo->fdct = (struct jpeg_forward_dct *) fdct;
  fdct->pub.start_pass = start_pass_fdctmgr;

-  switch (cinfo->dct_method) {
-#ifdef DCT_ISLOW_SUPPORTED
-  case JDCT_ISLOW:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_islow;
-    break;
-#endif
-#ifdef DCT_IFAST_SUPPORTED
-  case JDCT_IFAST:
-    fdct->pub.forward_DCT = forward_DCT;
-    fdct->do_dct = jpeg_fdct_ifast;
-    break;
-#endif
-#ifdef DCT_FLOAT_SUPPORTED
-  case JDCT_FLOAT:
-    fdct->pub.forward_DCT = forward_DCT_float;
-    fdct->do_float_dct = jpeg_fdct_float;
-    break;
-#endif
-  default:
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
-    break;
-  }
-
  /* Mark divisor tables unallocated */
  for (i = 0; i < NUM_QUANT_TBLS; i++) {
    fdct->divisors[i] = NULL;
--- a/3rdparty/libjpeg/jchuff.c
+++ b/3rdparty/libjpeg/jchuff.c
--- a/3rdparty/libjpeg/jchuff.h
+++ b/3rdparty/libjpeg/jchuff.h
@@ -1,47 +0,0 @@
-/*
- * jchuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy encoding routines
- * that are shared between the sequential encoder (jchuff.c) and the
- * progressive encoder (jcphuff.c).  No other modules need to see these.
- */
-
-/* The legal range of a DCT coefficient is
- *  -1024 .. +1023  for 8-bit data;
- * -16384 .. +16383 for 12-bit data.
- * Hence the magnitude should always fit in 10 or 14 bits respectively.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
-#else
-#define MAX_COEF_BITS 14
-#endif
-
-/* Derived data constructed for each Huffman table */
-
-typedef struct {
-  unsigned int ehufco[256];	/* code for each symbol */
-  char ehufsi[256];		/* length of code for each symbol */
-  /* If no code has been allocated for a symbol S, ehufsi[S] contains 0 */
-} c_derived_tbl;
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_c_derived_tbl	jMkCDerived
-#define jpeg_gen_optimal_table	jGenOptTbl
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
-    JPP((j_compress_ptr cinfo, boolean isDC, int tblno,
-         c_derived_tbl ** pdtbl));
-
-/* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
-    JPP((j_compress_ptr cinfo, JHUFF_TBL * htbl, long freq[]));
--- a/3rdparty/libjpeg/jcinit.c
+++ b/3rdparty/libjpeg/jcinit.c
@@ -41,16 +41,9 @@ jinit_compress_master (j_compress_ptr cinfo)
  /* Forward DCT */
  jinit_forward_dct(cinfo);
  /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
    jinit_huff_encoder(cinfo);
  }

--- a/3rdparty/libjpeg/jcmainct.c
+++ b/3rdparty/libjpeg/jcmainct.c
@@ -2,11 +2,12 @@
 * jcmainct.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
- * This file contains the main_ptr buffer controller for compression.
- * The main_ptr buffer lies between the pre-processor and the JPEG
+ * This file contains the main buffer controller for compression.
+ * The main buffer lies between the pre-processor and the JPEG
 * compressor proper; it holds downsampled data in the JPEG colorspace.
 */

@@ -68,32 +69,32 @@ METHODDEF(void) process_data_buffer_main
 METHODDEF(void)
 start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;

  /* Do nothing in raw-data mode. */
  if (cinfo->raw_data_in)
    return;

-  main_ptr->cur_iMCU_row = 0;	/* initialize counters */
-  main_ptr->rowgroup_ctr = 0;
-  main_ptr->suspended = FALSE;
-  main_ptr->pass_mode = pass_mode;	/* save mode for use by process_data */
+  mainp->cur_iMCU_row = 0;	/* initialize counters */
+  mainp->rowgroup_ctr = 0;
+  mainp->suspended = FALSE;
+  mainp->pass_mode = pass_mode;	/* save mode for use by process_data */

  switch (pass_mode) {
  case JBUF_PASS_THRU:
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    if (main_ptr->whole_image[0] != NULL)
+    if (mainp->whole_image[0] != NULL)
      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
-    main_ptr->pub.process_data = process_data_simple_main;
+    mainp->pub.process_data = process_data_simple_main;
    break;
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
  case JBUF_SAVE_SOURCE:
  case JBUF_CRANK_DEST:
  case JBUF_SAVE_AND_PASS:
-    if (main_ptr->whole_image[0] == NULL)
+    if (mainp->whole_image[0] == NULL)
      ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
-    main_ptr->pub.process_data = process_data_buffer_main;
+    mainp->pub.process_data = process_data_buffer_main;
    break;
 #endif
  default:
@@ -114,46 +115,46 @@ process_data_simple_main (j_compress_ptr cinfo,
                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
                          JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;

-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
-    /* Read input data if we haven't filled the main_ptr buffer yet */
-    if (main_ptr->rowgroup_ctr < DCTSIZE)
+  while (mainp->cur_iMCU_row < cinfo->total_iMCU_rows) {
+    /* Read input data if we haven't filled the main buffer yet */
+    if (mainp->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size)
      (*cinfo->prep->pre_process_data) (cinfo,
                                        input_buf, in_row_ctr, in_rows_avail,
-                    main_ptr->buffer, &main_ptr->rowgroup_ctr,
-                    (JDIMENSION) DCTSIZE);
+                                        mainp->buffer, &mainp->rowgroup_ctr,
+                                        (JDIMENSION) cinfo->min_DCT_v_scaled_size);

    /* If we don't have a full iMCU row buffered, return to application for
     * more data.  Note that preprocessor will always pad to fill the iMCU row
     * at the bottom of the image.
     */
-    if (main_ptr->rowgroup_ctr != DCTSIZE)
+    if (mainp->rowgroup_ctr != (JDIMENSION) cinfo->min_DCT_v_scaled_size)
      return;

    /* Send the completed row to the compressor */
-    if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (! (*cinfo->coef->compress_data) (cinfo, mainp->buffer)) {
      /* If compressor did not consume the whole row, then we must need to
       * suspend processing and return to the application.  In this situation
       * we pretend we didn't yet consume the last input row; otherwise, if
       * it happened to be the last row of the image, the application would
       * think we were done.
       */
-      if (! main_ptr->suspended) {
+      if (! mainp->suspended) {
        (*in_row_ctr)--;
-    main_ptr->suspended = TRUE;
+        mainp->suspended = TRUE;
      }
      return;
    }
    /* We did finish the row.  Undo our little suspension hack if a previous
-     * call suspended; then mark the main_ptr buffer empty.
+     * call suspended; then mark the main buffer empty.
     */
-    if (main_ptr->suspended) {
+    if (mainp->suspended) {
      (*in_row_ctr)++;
-      main_ptr->suspended = FALSE;
+      mainp->suspended = FALSE;
    }
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
+    mainp->rowgroup_ctr = 0;
+    mainp->cur_iMCU_row++;
  }
 }

@@ -170,25 +171,27 @@ process_data_buffer_main (j_compress_ptr cinfo,
                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
                          JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  int ci;
  jpeg_component_info *compptr;
-  boolean writing = (main_ptr->pass_mode != JBUF_CRANK_DEST);
+  boolean writing = (mainp->pass_mode != JBUF_CRANK_DEST);

-  while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
+  while (mainp->cur_iMCU_row < cinfo->total_iMCU_rows) {
    /* Realign the virtual buffers if at the start of an iMCU row. */
-    if (main_ptr->rowgroup_ctr == 0) {
+    if (mainp->rowgroup_ctr == 0) {
      for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
           ci++, compptr++) {
-    main_ptr->buffer[ci] = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr) cinfo, main_ptr->whole_image[ci],
-       main_ptr->cur_iMCU_row * (compptr->v_samp_factor * DCTSIZE),
-       (JDIMENSION) (compptr->v_samp_factor * DCTSIZE), writing);
+        mainp->buffer[ci] = (*cinfo->mem->access_virt_sarray)
+          ((j_common_ptr) cinfo, mainp->whole_image[ci], mainp->cur_iMCU_row *
+           ((JDIMENSION) (compptr->v_samp_factor * cinfo->min_DCT_v_scaled_size)),
+           (JDIMENSION) (compptr->v_samp_factor * cinfo->min_DCT_v_scaled_size),
+           writing);
      }
      /* In a read pass, pretend we just read some source data. */
      if (! writing) {
-    *in_row_ctr += cinfo->max_v_samp_factor * DCTSIZE;
-    main_ptr->rowgroup_ctr = DCTSIZE;
+        *in_row_ctr += (JDIMENSION)
+          (cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size);
+        mainp->rowgroup_ctr = (JDIMENSION) cinfo->min_DCT_v_scaled_size;
      }
    }

@@ -197,40 +200,40 @@ process_data_buffer_main (j_compress_ptr cinfo,
    if (writing) {
      (*cinfo->prep->pre_process_data) (cinfo,
                                        input_buf, in_row_ctr, in_rows_avail,
-                    main_ptr->buffer, &main_ptr->rowgroup_ctr,
-                    (JDIMENSION) DCTSIZE);
+                                        mainp->buffer, &mainp->rowgroup_ctr,
+                                        (JDIMENSION) cinfo->min_DCT_v_scaled_size);
      /* Return to application if we need more data to fill the iMCU row. */
-      if (main_ptr->rowgroup_ctr < DCTSIZE)
+      if (mainp->rowgroup_ctr < (JDIMENSION) cinfo->min_DCT_v_scaled_size)
        return;
    }

    /* Emit data, unless this is a sink-only pass. */
-    if (main_ptr->pass_mode != JBUF_SAVE_SOURCE) {
-      if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (mainp->pass_mode != JBUF_SAVE_SOURCE) {
+      if (! (*cinfo->coef->compress_data) (cinfo, mainp->buffer)) {
        /* If compressor did not consume the whole row, then we must need to
         * suspend processing and return to the application.  In this situation
         * we pretend we didn't yet consume the last input row; otherwise, if
         * it happened to be the last row of the image, the application would
         * think we were done.
         */
-    if (! main_ptr->suspended) {
+        if (! mainp->suspended) {
          (*in_row_ctr)--;
-      main_ptr->suspended = TRUE;
+          mainp->suspended = TRUE;
        }
        return;
      }
      /* We did finish the row.  Undo our little suspension hack if a previous
-       * call suspended; then mark the main_ptr buffer empty.
+       * call suspended; then mark the main buffer empty.
       */
-      if (main_ptr->suspended) {
+      if (mainp->suspended) {
        (*in_row_ctr)++;
-    main_ptr->suspended = FALSE;
+        mainp->suspended = FALSE;
      }
    }

    /* If get here, we are done with this iMCU row.  Mark buffer empty. */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->cur_iMCU_row++;
+    mainp->rowgroup_ctr = 0;
+    mainp->cur_iMCU_row++;
  }
 }

@@ -238,21 +241,21 @@ process_data_buffer_main (j_compress_ptr cinfo,


 /*
- * Initialize main_ptr buffer controller.
+ * Initialize main buffer controller.
 */

 GLOBAL(void)
 jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr main_ptr;
+  my_main_ptr mainp;
  int ci;
  jpeg_component_info *compptr;

-  main_ptr = (my_main_ptr)
+  mainp = (my_main_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
-  main_ptr->pub.start_pass = start_pass_main;
+  cinfo->main = &mainp->pub;
+  mainp->pub.start_pass = start_pass_main;

  /* We don't need to create a buffer in raw-data mode. */
  if (cinfo->raw_data_in)
@@ -267,27 +270,28 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
    /* Note we pad the bottom to a multiple of the iMCU height */
    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
         ci++, compptr++) {
-      main_ptr->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
+      mainp->whole_image[ci] = (*cinfo->mem->request_virt_sarray)
        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-     compptr->width_in_blocks * DCTSIZE,
-     (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                (long) compptr->v_samp_factor) * DCTSIZE,
-     (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+         compptr->width_in_blocks * ((JDIMENSION) compptr->DCT_h_scaled_size),
+         ((JDIMENSION) jround_up((long) compptr->height_in_blocks,
+                                 (long) compptr->v_samp_factor)) *
+         ((JDIMENSION) cinfo->min_DCT_v_scaled_size),
+         (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
    }
 #else
    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 #endif
  } else {
 #ifdef FULL_MAIN_BUFFER_SUPPORTED
-    main_ptr->whole_image[0] = NULL; /* flag for no virtual arrays */
+    mainp->whole_image[0] = NULL; /* flag for no virtual arrays */
 #endif
    /* Allocate a strip buffer for each component */
    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
         ci++, compptr++) {
-      main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+      mainp->buffer[ci] = (*cinfo->mem->alloc_sarray)
        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     compptr->width_in_blocks * DCTSIZE,
-     (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+         compptr->width_in_blocks * ((JDIMENSION) compptr->DCT_h_scaled_size),
+         (JDIMENSION) (compptr->v_samp_factor * compptr->DCT_v_scaled_size));
    }
  }
 }
--- a/3rdparty/libjpeg/jcmarker.c
+++ b/3rdparty/libjpeg/jcmarker.c
@@ -2,6 +2,7 @@
 * jcmarker.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -72,6 +73,7 @@ typedef enum {			/* JPEG marker codes */
  M_APP15 = 0xef,

  M_JPG0  = 0xf0,
+  M_JPG8  = 0xf8,
  M_JPG13 = 0xfd,
  M_COM   = 0xfe,

@@ -153,21 +155,22 @@ emit_dqt (j_compress_ptr cinfo, int index)
    ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, index);

  prec = 0;
-  for (i = 0; i < DCTSIZE2; i++) {
-    if (qtbl->quantval[i] > 255)
+  for (i = 0; i <= cinfo->lim_Se; i++) {
+    if (qtbl->quantval[cinfo->natural_order[i]] > 255)
      prec = 1;
  }

  if (! qtbl->sent_table) {
    emit_marker(cinfo, M_DQT);

-    emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_2bytes(cinfo,
+      prec ? cinfo->lim_Se * 2 + 2 + 1 + 2 : cinfo->lim_Se + 1 + 1 + 2);

    emit_byte(cinfo, index + (prec<<4));

-    for (i = 0; i < DCTSIZE2; i++) {
+    for (i = 0; i <= cinfo->lim_Se; i++) {
      /* The table entries must be emitted in zigzag order. */
-      unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
+      unsigned int qval = qtbl->quantval[cinfo->natural_order[i]];
      if (prec)
        emit_byte(cinfo, (int) (qval >> 8));
      emit_byte(cinfo, (int) (qval & 0xFF));
@@ -235,7 +238,11 @@ emit_dac (j_compress_ptr cinfo)

  for (i = 0; i < cinfo->comps_in_scan; i++) {
    compptr = cinfo->cur_comp_info[i];
+    /* DC needs no table for refinement scan */
+    if (cinfo->Ss == 0 && cinfo->Ah == 0)
      dc_in_use[compptr->dc_tbl_no] = 1;
+    /* AC needs no table when not present */
+    if (cinfo->Se)
      ac_in_use[compptr->ac_tbl_no] = 1;
  }

@@ -243,6 +250,7 @@ emit_dac (j_compress_ptr cinfo)
  for (i = 0; i < NUM_ARITH_TBLS; i++)
    length += dc_in_use[i] + ac_in_use[i];

+  if (length) {
    emit_marker(cinfo, M_DAC);

    emit_2bytes(cinfo, length*2 + 2);
@@ -257,6 +265,7 @@ emit_dac (j_compress_ptr cinfo)
        emit_byte(cinfo, cinfo->arith_ac_K[i]);
      }
    }
+  }
 #endif /* C_ARITH_CODING_SUPPORTED */
 }

@@ -273,6 +282,37 @@ emit_dri (j_compress_ptr cinfo)
 }


+LOCAL(void)
+emit_lse_ict (j_compress_ptr cinfo)
+/* Emit an LSE inverse color transform specification marker */
+{
+  /* Support only 1 transform */
+  if (cinfo->color_transform != JCT_SUBTRACT_GREEN ||
+      cinfo->num_components < 3)
+    ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+
+  emit_marker(cinfo, M_JPG8);
+
+  emit_2bytes(cinfo, 24);	/* fixed length */
+
+  emit_byte(cinfo, 0x0D);	/* ID inverse transform specification */
+  emit_2bytes(cinfo, MAXJSAMPLE);	/* MAXTRANS */
+  emit_byte(cinfo, 3);		/* Nt=3 */
+  emit_byte(cinfo, cinfo->comp_info[1].component_id);
+  emit_byte(cinfo, cinfo->comp_info[0].component_id);
+  emit_byte(cinfo, cinfo->comp_info[2].component_id);
+  emit_byte(cinfo, 0x80);	/* F1: CENTER1=1, NORM1=0 */
+  emit_2bytes(cinfo, 0);	/* A(1,1)=0 */
+  emit_2bytes(cinfo, 0);	/* A(1,2)=0 */
+  emit_byte(cinfo, 0);		/* F2: CENTER2=0, NORM2=0 */
+  emit_2bytes(cinfo, 1);	/* A(2,1)=1 */
+  emit_2bytes(cinfo, 0);	/* A(2,2)=0 */
+  emit_byte(cinfo, 0);		/* F3: CENTER3=0, NORM3=0 */
+  emit_2bytes(cinfo, 1);	/* A(3,1)=1 */
+  emit_2bytes(cinfo, 0);	/* A(3,2)=0 */
+}
+
+
 LOCAL(void)
 emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
 /* Emit a SOF marker */
@@ -285,13 +325,13 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
  emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */

  /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->image_height > 65535L ||
-      (long) cinfo->image_width > 65535L)
+  if ((long) cinfo->jpeg_height > 65535L ||
+      (long) cinfo->jpeg_width > 65535L)
    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);

  emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int) cinfo->image_height);
-  emit_2bytes(cinfo, (int) cinfo->image_width);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_height);
+  emit_2bytes(cinfo, (int) cinfo->jpeg_width);

  emit_byte(cinfo, cinfo->num_components);

@@ -320,22 +360,16 @@ emit_sos (j_compress_ptr cinfo)
  for (i = 0; i < cinfo->comps_in_scan; i++) {
    compptr = cinfo->cur_comp_info[i];
    emit_byte(cinfo, compptr->component_id);
-    td = compptr->dc_tbl_no;
-    ta = compptr->ac_tbl_no;
-    if (cinfo->progressive_mode) {
-      /* Progressive mode: only DC or only AC tables are used in one scan;
-       * furthermore, Huffman coding of DC refinement uses no table at all.
-       * We emit 0 for unused field(s); this is recommended by the P&M text
+
+    /* We emit 0 for unused field(s); this is recommended by the P&M text
     * but does not seem to be specified in the standard.
     */
-      if (cinfo->Ss == 0) {
-    ta = 0;			/* DC scan */
-    if (cinfo->Ah != 0 && !cinfo->arith_code)
-      td = 0;		/* no DC table either */
-      } else {
-    td = 0;			/* AC scan */
-      }
-    }
+
+    /* DC needs no table for refinement scan */
+    td = cinfo->Ss == 0 && cinfo->Ah == 0 ? compptr->dc_tbl_no : 0;
+    /* AC needs no table when not present */
+    ta = cinfo->Se ? compptr->ac_tbl_no : 0;
+
    emit_byte(cinfo, (td << 4) + ta);
  }

@@ -345,6 +379,22 @@ emit_sos (j_compress_ptr cinfo)
 }


+LOCAL(void)
+emit_pseudo_sos (j_compress_ptr cinfo)
+/* Emit a pseudo SOS marker */
+{
+  emit_marker(cinfo, M_SOS);
+
+  emit_2bytes(cinfo, 2 + 1 + 3); /* length */
+
+  emit_byte(cinfo, 0); /* Ns */
+
+  emit_byte(cinfo, 0); /* Ss */
+  emit_byte(cinfo, cinfo->block_size * cinfo->block_size - 1); /* Se */
+  emit_byte(cinfo, 0); /* Ah/Al */
+}
+
+
 LOCAL(void)
 emit_jfif_app0 (j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
@@ -484,7 +534,8 @@ write_file_header (j_compress_ptr cinfo)

 /*
 * Write frame header.
- * This consists of DQT and SOFn markers.
+ * This consists of DQT and SOFn markers,
+ * a conditional LSE marker and a conditional pseudo SOS marker.
 * Note that we do not emit the SOF until we have emitted the DQT(s).
 * This avoids compatibility problems with incorrect implementations that
 * try to error-check the quant table numbers as soon as they see the SOF.
@@ -511,7 +562,7 @@ write_frame_header (j_compress_ptr cinfo)
   * Note we assume that Huffman table numbers won't be changed later.
   */
  if (cinfo->arith_code || cinfo->progressive_mode ||
-      cinfo->data_precision != 8) {
+      cinfo->data_precision != 8 || cinfo->block_size != DCTSIZE) {
    is_baseline = FALSE;
  } else {
    is_baseline = TRUE;
@@ -529,7 +580,10 @@ write_frame_header (j_compress_ptr cinfo)

  /* Emit the proper SOF marker */
  if (cinfo->arith_code) {
-    emit_sof(cinfo, M_SOF9);	/* SOF code for arithmetic coding */
+    if (cinfo->progressive_mode)
+      emit_sof(cinfo, M_SOF10); /* SOF code for progressive arithmetic */
+    else
+      emit_sof(cinfo, M_SOF9);  /* SOF code for sequential arithmetic */
  } else {
    if (cinfo->progressive_mode)
      emit_sof(cinfo, M_SOF2);	/* SOF code for progressive Huffman */
@@ -538,6 +592,14 @@ write_frame_header (j_compress_ptr cinfo)
    else
      emit_sof(cinfo, M_SOF1);	/* SOF code for non-baseline Huffman file */
  }
+
+  /* Check to emit LSE inverse color transform specification marker */
+  if (cinfo->color_transform)
+    emit_lse_ict(cinfo);
+
+  /* Check to emit pseudo SOS marker */
+  if (cinfo->progressive_mode && cinfo->block_size != DCTSIZE)
+    emit_pseudo_sos(cinfo);
 }


@@ -566,20 +628,13 @@ write_scan_header (j_compress_ptr cinfo)
     */
    for (i = 0; i < cinfo->comps_in_scan; i++) {
      compptr = cinfo->cur_comp_info[i];
-      if (cinfo->progressive_mode) {
-    /* Progressive mode: only DC or only AC tables are used in one scan */
-    if (cinfo->Ss == 0) {
-      if (cinfo->Ah == 0)	/* DC needs no table for refinement scan */
+      /* DC needs no table for refinement scan */
+      if (cinfo->Ss == 0 && cinfo->Ah == 0)
        emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
-    } else {
+      /* AC needs no table when not present */
+      if (cinfo->Se)
        emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
    }
-      } else {
-    /* Sequential mode: need both DC and AC tables */
-    emit_dht(cinfo, compptr->dc_tbl_no, FALSE);
-    emit_dht(cinfo, compptr->ac_tbl_no, TRUE);
-      }
-    }
  }

  /* Emit DRI if required --- note that DRI value could change for each scan.
@@ -650,7 +705,7 @@ jinit_marker_writer (j_compress_ptr cinfo)
  marker = (my_marker_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_marker_writer));
-  cinfo->marker = (struct jpeg_marker_writer *) marker;
+  cinfo->marker = &marker->pub;
  /* Initialize method pointers */
  marker->pub.write_file_header = write_file_header;
  marker->pub.write_frame_header = write_frame_header;
--- a/3rdparty/libjpeg/jcmaster.c
+++ b/3rdparty/libjpeg/jcmaster.c
@@ -2,6 +2,7 @@
 * jcmaster.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2003-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -42,23 +43,220 @@ typedef my_comp_master * my_master_ptr;
 * Support routines that do various essential calculations.
 */

-LOCAL(void)
-initial_setup (j_compress_ptr cinfo)
+/*
+ * Compute JPEG image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
-  int ci;
+#ifdef DCT_SCALING_SUPPORTED
+
+  /* Sanity check on input image dimensions to prevent overflow in
+   * following calculation.
+   * We do check jpeg_width and jpeg_height in initial_setup below,
+   * but image_width and image_height can come from arbitrary data,
+   * and we need some space for multiplication by block_size.
+   */
+  if (((long) cinfo->image_width >> 24) || ((long) cinfo->image_height >> 24))
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+
+  /* Compute actual JPEG image dimensions and DCT scaling choices. */
+  if (cinfo->scale_num >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/1 scaling */
+    cinfo->jpeg_width = cinfo->image_width * cinfo->block_size;
+    cinfo->jpeg_height = cinfo->image_height * cinfo->block_size;
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * 2 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/2 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 2L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 2L);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * 3 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/3 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 3L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 3L);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num * 4 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/4 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 4L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 4L);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * 5 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/5 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 5L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 5L);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * 6 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/6 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 6L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 6L);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * 7 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/7 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 7L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 7L);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num * 8 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/8 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 8L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 8L);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * 9 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/9 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 9L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 9L);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * 10 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/10 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 10L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 10L);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * 11 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/11 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 11L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 11L);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * 12 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/12 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 12L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 12L);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * 13 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/13 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 13L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 13L);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * 14 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/14 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 14L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 14L);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * 15 >= cinfo->scale_denom * cinfo->block_size) {
+    /* Provide block_size/15 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 15L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 15L);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide block_size/16 scaling */
+    cinfo->jpeg_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * cinfo->block_size, 16L);
+    cinfo->jpeg_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * cinfo->block_size, 16L);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+#else /* !DCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+
+#endif /* DCT_SCALING_SUPPORTED */
+}
+
+
+LOCAL(void)
+jpeg_calc_trans_dimensions (j_compress_ptr cinfo)
+{
+  if (cinfo->min_DCT_h_scaled_size != cinfo->min_DCT_v_scaled_size)
+    ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+             cinfo->min_DCT_h_scaled_size, cinfo->min_DCT_v_scaled_size);
+
+  cinfo->block_size = cinfo->min_DCT_h_scaled_size;
+}
+
+
+LOCAL(void)
+initial_setup (j_compress_ptr cinfo, boolean transcode_only)
+/* Do computations that are needed before master selection phase */
+{
+  int ci, ssize;
  jpeg_component_info *compptr;
  long samplesperrow;
  JDIMENSION jd_samplesperrow;

+  if (transcode_only)
+    jpeg_calc_trans_dimensions(cinfo);
+  else
+    jpeg_calc_jpeg_dimensions(cinfo);
+
+  /* Sanity check on block_size */
+  if (cinfo->block_size < 1 || cinfo->block_size > 16)
+    ERREXIT2(cinfo, JERR_BAD_DCTSIZE, cinfo->block_size, cinfo->block_size);
+
+  /* Derive natural_order from block_size */
+  switch (cinfo->block_size) {
+  case 2: cinfo->natural_order = jpeg_natural_order2; break;
+  case 3: cinfo->natural_order = jpeg_natural_order3; break;
+  case 4: cinfo->natural_order = jpeg_natural_order4; break;
+  case 5: cinfo->natural_order = jpeg_natural_order5; break;
+  case 6: cinfo->natural_order = jpeg_natural_order6; break;
+  case 7: cinfo->natural_order = jpeg_natural_order7; break;
+  default: cinfo->natural_order = jpeg_natural_order; break;
+  }
+
+  /* Derive lim_Se from block_size */
+  cinfo->lim_Se = cinfo->block_size < DCTSIZE ?
+    cinfo->block_size * cinfo->block_size - 1 : DCTSIZE2-1;
+
  /* Sanity check on image dimensions */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
-      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+  if (cinfo->jpeg_height <= 0 || cinfo->jpeg_width <= 0 ||
+      cinfo->num_components <= 0 || cinfo->input_components <= 0)
    ERREXIT(cinfo, JERR_EMPTY_IMAGE);

  /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
+  if ((long) cinfo->jpeg_height > (long) JPEG_MAX_DIMENSION ||
+      (long) cinfo->jpeg_width > (long) JPEG_MAX_DIMENSION)
    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);

  /* Width of an input scanline must be representable as JDIMENSION. */
@@ -95,22 +293,52 @@ initial_setup (j_compress_ptr cinfo)
       ci++, compptr++) {
    /* Fill in the correct component_index value; don't rely on application */
    compptr->component_index = ci;
-    /* For compression, we never do DCT scaling. */
-    compptr->DCT_scaled_size = DCTSIZE;
+    /* In selecting the actual DCT scaling for each component, we try to
+     * scale down the chroma components via DCT scaling rather than downsampling.
+     * This saves time if the downsampler gets to use 1:1 scaling.
+     * Note this code adapts subsampling ratios which are powers of 2.
+     */
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+           (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+           (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+#ifdef DCT_SCALING_SUPPORTED
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+           (cinfo->do_fancy_downsampling ? DCTSIZE : DCTSIZE / 2) &&
+           (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+#endif
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support DCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+        compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+        compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
+
    /* Size in DCT blocks */
    compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-            (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width * (long) compptr->h_samp_factor,
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-            (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height * (long) compptr->v_samp_factor,
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
    /* Size in samples */
    compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-            (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_width *
+                    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-            (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long) cinfo->jpeg_height *
+                    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
    /* Mark component needed (this flag isn't actually used for compression) */
    compptr->component_needed = TRUE;
  }
@@ -119,8 +347,8 @@ initial_setup (j_compress_ptr cinfo)
   * main controller will call coefficient controller).
   */
  cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
-          (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long) cinfo->jpeg_height,
+                  (long) (cinfo->max_v_samp_factor * cinfo->block_size));
 }


@@ -260,6 +488,39 @@ validate_script (j_compress_ptr cinfo)
  }
 }

+
+LOCAL(void)
+reduce_script (j_compress_ptr cinfo)
+/* Adapt scan script for use with reduced block size;
+ * assume that script has been validated before.
+ */
+{
+  jpeg_scan_info * scanptr;
+  int idxout, idxin;
+
+  /* Circumvent const declaration for this function */
+  scanptr = (jpeg_scan_info *) cinfo->scan_info;
+  idxout = 0;
+
+  for (idxin = 0; idxin < cinfo->num_scans; idxin++) {
+    /* After skipping, idxout becomes smaller than idxin */
+    if (idxin != idxout)
+      /* Copy rest of data;
+       * note we stay in given chunk of allocated memory.
+       */
+      scanptr[idxout] = scanptr[idxin];
+    if (scanptr[idxout].Ss > cinfo->lim_Se)
+      /* Entire scan out of range - skip this entry */
+      continue;
+    if (scanptr[idxout].Se > cinfo->lim_Se)
+      /* Limit scan to end of block */
+      scanptr[idxout].Se = cinfo->lim_Se;
+    idxout++;
+  }
+
+  cinfo->num_scans = idxout;
+}
+
 #endif /* C_MULTISCAN_FILES_SUPPORTED */


@@ -280,10 +541,13 @@ select_scan_parameters (j_compress_ptr cinfo)
      cinfo->cur_comp_info[ci] =
        &cinfo->comp_info[scanptr->component_index[ci]];
    }
+    if (cinfo->progressive_mode) {
      cinfo->Ss = scanptr->Ss;
      cinfo->Se = scanptr->Se;
      cinfo->Ah = scanptr->Ah;
      cinfo->Al = scanptr->Al;
+      return;
+    }
  }
  else
 #endif
@@ -296,12 +560,12 @@ select_scan_parameters (j_compress_ptr cinfo)
    for (ci = 0; ci < cinfo->num_components; ci++) {
      cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
    }
+  }
  cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2-1;
+  cinfo->Se = cinfo->block_size * cinfo->block_size - 1;
  cinfo->Ah = 0;
  cinfo->Al = 0;
 }
-}


 LOCAL(void)
@@ -325,7 +589,7 @@ per_scan_setup (j_compress_ptr cinfo)
    compptr->MCU_width = 1;
    compptr->MCU_height = 1;
    compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = DCTSIZE;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
    compptr->last_col_width = 1;
    /* For noninterleaved scans, it is convenient to define last_row_height
     * as the number of block rows present in the last iMCU row.
@@ -347,11 +611,11 @@ per_scan_setup (j_compress_ptr cinfo)

    /* Overall image size in MCUs */
    cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
-            (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_width,
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-            (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long) cinfo->jpeg_height,
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));

    cinfo->blocks_in_MCU = 0;

@@ -361,7 +625,7 @@ per_scan_setup (j_compress_ptr cinfo)
      compptr->MCU_width = compptr->h_samp_factor;
      compptr->MCU_height = compptr->v_samp_factor;
      compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
      /* Figure number of non-dummy blocks in last MCU column & row */
      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
      if (tmp == 0) tmp = compptr->MCU_width;
@@ -433,7 +697,7 @@ prepare_for_pass (j_compress_ptr cinfo)
    /* Do Huffman optimization for a scan after the first one. */
    select_scan_parameters(cinfo);
    per_scan_setup(cinfo);
-    if (cinfo->Ss != 0 || cinfo->Ah == 0 || cinfo->arith_code) {
+    if (cinfo->Ss != 0 || cinfo->Ah == 0) {
      (*cinfo->entropy->start_pass) (cinfo, TRUE);
      (*cinfo->coef->start_pass) (cinfo, JBUF_CRANK_DEST);
      master->pub.call_pass_startup = FALSE;
@@ -554,11 +818,13 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
  master->pub.is_last_pass = FALSE;

  /* Validate parameters, determine derived values */
-  initial_setup(cinfo);
+  initial_setup(cinfo, transcode_only);

  if (cinfo->scan_info != NULL) {
 #ifdef C_MULTISCAN_FILES_SUPPORTED
    validate_script(cinfo);
+    if (cinfo->block_size < DCTSIZE)
+      reduce_script(cinfo);
 #else
    ERREXIT(cinfo, JERR_NOT_COMPILED);
 #endif
@@ -567,8 +833,10 @@ jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
    cinfo->num_scans = 1;
  }

-  if (cinfo->progressive_mode)	/*  TEMPORARY HACK ??? */
-    cinfo->optimize_coding = TRUE; /* assume default tables no good for progressive mode */
+  if ((cinfo->progressive_mode || cinfo->block_size < DCTSIZE) &&
+      !cinfo->arith_code)			/*  TEMPORARY HACK ??? */
+    /* assume default tables no good for progressive or downscale mode */
+    cinfo->optimize_coding = TRUE;

  /* Initialize my private state */
  if (transcode_only) {
--- a/3rdparty/libjpeg/jconfig.h
+++ b/3rdparty/libjpeg/jconfig.h
@@ -1,40 +1,83 @@
-/* jconfig.h.  Generated automatically by configure.  */
-/* jconfig.cfg --- source file edited by configure script */
-/* see jconfig.doc for explanations */
-
 #define HAVE_PROTOTYPES
 #define HAVE_UNSIGNED_CHAR
 #define HAVE_UNSIGNED_SHORT
-/*#undef void*/
-/*#undef const*/
+
+/* Define this if an ordinary "char" type is unsigned.
+ * If you're not sure, leaving it undefined will work at some cost in speed.
+ * If you defined HAVE_UNSIGNED_CHAR then the speed difference is minimal.
+ */
 #undef CHAR_IS_UNSIGNED

 #if defined __MINGW__ || defined __MINGW32__ || (!defined WIN32 && !defined _WIN32)
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
 #define HAVE_STDDEF_H
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
 #define HAVE_STDLIB_H
 #endif

+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
 #undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
 #undef NEED_SYS_TYPES_H
+
+/* For 80x86 machines, you need to define NEED_FAR_POINTERS,
+ * unless you are using a large-data memory model or 80386 flat-memory mode.
+ * On less brain-damaged CPUs this symbol must not be defined.
+ * (Defining this symbol causes large data structures to be referenced through
+ * "far" pointers and to be allocated with a special version of malloc.)
+ */
 #undef NEED_FAR_POINTERS
+
+/* Define this if your linker needs global names to be unique in less
+ * than the first 15 characters.
+ */
 #undef NEED_SHORT_EXTERNAL_NAMES
-/* Define this if you get warnings about undefined structures. */
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
 #undef INCOMPLETE_TYPES_BROKEN

-#if defined WIN32 || defined _WIN32
-/* Define "boolean" as unsigned char, not int, per Windows custom */
+/* Define "boolean" as unsigned char, not int, on Windows systems.
+ */
+#ifdef _WIN32
 #ifndef __RPCNDR_H__		/* don't conflict if rpcndr.h already read */
 typedef unsigned char boolean;
 #endif
 #define HAVE_BOOLEAN		/* prevent jmorecfg.h from redefining it */
 #endif

+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
 #ifdef JPEG_INTERNALS

+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
 #undef RIGHT_SHIFT_IS_UNSIGNED

 /* These are for configuring the JPEG memory manager. */
-#define DEFAULT_MAX_MEM 1073741824
+#define DEFAULT_MAX_MEM 1073741824 /*1Gb*/

 #if !defined WIN32 && !defined _WIN32
 #define INLINE __inline__
@@ -43,14 +86,28 @@ typedef unsigned char boolean;

 #endif /* JPEG_INTERNALS */

+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
 #ifdef JPEG_CJPEG_DJPEG

+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
 #define BMP_SUPPORTED		/* BMP image file format */
 #define GIF_SUPPORTED		/* GIF image file format */
 #define PPM_SUPPORTED		/* PBMPLUS PPM/PGM image file format */
 #undef RLE_SUPPORTED		/* Utah RLE image file format */
 #define TARGA_SUPPORTED		/* Targa image file format */

+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
 #if defined WIN32 || defined _WIN32
 #define TWO_FILE_COMMANDLINE	/* optional */
 #define USE_SETMODE		/* Microsoft has setmode() */
@@ -58,10 +115,22 @@ typedef unsigned char boolean;
 #undef TWO_FILE_COMMANDLINE
 #endif

+/* Define this if your system needs explicit cleanup of temporary files.
+ * This is crucial under MS-DOS, where the temporary "files" may be areas
+ * of extended memory; on most other systems it's not as important.
+ */
 #undef NEED_SIGNAL_CATCHER
+
+/* By default, we open image files with fopen(...,"rb") or fopen(...,"wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
 #undef DONT_USE_B_MODE

-/* Define this if you want percent-done progress reports from cjpeg/djpeg. */
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
 #undef PROGRESS_REPORT

+
 #endif /* JPEG_CJPEG_DJPEG */
--- a/3rdparty/libjpeg/jcparam.c
+++ b/3rdparty/libjpeg/jcparam.c
@@ -2,6 +2,7 @@
 * jcparam.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2003-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -60,15 +61,6 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }


-GLOBAL(void)
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-             boolean force_baseline)
-/* Set or change the 'quality' (quantization) setting, using default tables
- * and a straight percentage-scaling quality scale.  In most cases it's better
- * to use jpeg_set_quality (below); this entry point is provided for
- * applications that insist on a linear percentage scaling.
- */
-{
 /* These are the sample quantization tables given in JPEG spec section K.1.
 * The spec says that the values given produce "good" quality, and
 * when divided by 2, "very good" quality.
@@ -94,6 +86,31 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
  99,  99,  99,  99,  99,  99,  99,  99
 };

+
+GLOBAL(void)
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting, using default tables
+ * and straight percentage-scaling quality scales.
+ * This entry point allows different scalings for luminance and chrominance.
+ */
+{
+  /* Set up two quantization tables using the specified scaling */
+  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
+                       cinfo->q_scale_factor[0], force_baseline);
+  jpeg_add_quant_table(cinfo, 1, std_chrominance_quant_tbl,
+                       cinfo->q_scale_factor[1], force_baseline);
+}
+
+
+GLOBAL(void)
+jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+                         boolean force_baseline)
+/* Set or change the 'quality' (quantization) setting, using default tables
+ * and a straight percentage-scaling quality scale.  In most cases it's better
+ * to use jpeg_set_quality (below); this entry point is provided for
+ * applications that insist on a linear percentage scaling.
+ */
+{
  /* Set up two quantization tables using the specified scaling */
  jpeg_add_quant_table(cinfo, 0, std_luminance_quant_tbl,
                       scale_factor, force_baseline);
@@ -133,7 +150,7 @@ jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables.
 * This is the standard quality-adjusting entry point for typical user
 * interfaces; only those who want detailed control over quantization tables
- * would use the preceding three routines directly.
+ * would use the preceding routines directly.
 */
 {
  /* Convert user 0-100 rating to percentage scaling */
@@ -284,6 +301,8 @@ jpeg_set_defaults (j_compress_ptr cinfo)

  /* Initialize everything not dependent on the color space */

+  cinfo->scale_num = 1;		/* 1:1 scaling */
+  cinfo->scale_denom = 1;
  cinfo->data_precision = BITS_IN_JSAMPLE;
  /* Set up two quantization tables using default quality of 75 */
  jpeg_set_quality(cinfo, 75, TRUE);
@@ -320,6 +339,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  /* By default, use the simpler non-cosited sampling alignment */
  cinfo->CCIR601_sampling = FALSE;

+  /* By default, apply fancy downsampling */
+  cinfo->do_fancy_downsampling = TRUE;
+
  /* No input smoothing */
  cinfo->smoothing_factor = 0;

@@ -345,6 +367,9 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  cinfo->X_density = 1;		/* Pixel aspect ratio is square by default */
  cinfo->Y_density = 1;

+  /* No color transform */
+  cinfo->color_transform = JCT_NONE;
+
  /* Choose JPEG colorspace based on input space, set defaults accordingly */

  jpeg_default_colorspace(cinfo);
@@ -426,7 +451,9 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
    cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
    cinfo->num_components = 3;
    SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0);
-    SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
+    SET_COMP(1, 0x47 /* 'G' */, 1,1, 0,
+                cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0,
+                cinfo->color_transform == JCT_SUBTRACT_GREEN ? 1 : 0);
    SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0);
    break;
  case JCS_YCbCr:
--- a/3rdparty/libjpeg/jcphuff.c
+++ b/3rdparty/libjpeg/jcphuff.c
@@ -1,833 +0,0 @@
-/*
- * jcphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy encoding routines for progressive JPEG.
- *
- * We do not support output suspension in this module, since the library
- * currently does not allow multiple-scan files to be written with output
- * suspension.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jchuff.h"		/* Declarations shared with jchuff.c */
-
-#ifdef C_PROGRESSIVE_SUPPORTED
-
-/* Expanded entropy encoder object for progressive Huffman encoding. */
-
-typedef struct {
-  struct jpeg_entropy_encoder pub; /* public fields */
-
-  /* Mode flag: TRUE for optimization, FALSE for actual data output */
-  boolean gather_statistics;
-
-  /* Bit-level coding status.
-   * next_output_byte/free_in_buffer are local copies of cinfo->dest fields.
-   */
-  JOCTET * next_output_byte;	/* => next byte to write in buffer */
-  size_t free_in_buffer;	/* # of byte spaces remaining in buffer */
-  INT32 put_buffer;		/* current bit-accumulation buffer */
-  int put_bits;			/* # of bits now in it */
-  j_compress_ptr cinfo;		/* link to cinfo (needed for dump_buffer) */
-
-  /* Coding status for DC components */
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-
-  /* Coding status for AC components */
-  int ac_tbl_no;		/* the table number of the single component */
-  unsigned int EOBRUN;		/* run length of EOBs */
-  unsigned int BE;		/* # of buffered correction bits before MCU */
-  char * bit_buffer;		/* buffer for correction bits (1 per char) */
-  /* packing correction bits tightly would save some space but cost time... */
-
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-  int next_restart_num;		/* next restart number to write (0-7) */
-
-  /* Pointers to derived tables (these workspaces have image lifespan).
-   * Since any one scan codes only DC or only AC, we only need one set
-   * of tables, not one for DC and one for AC.
-   */
-  c_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  /* Statistics tables for optimization; again, one set is enough */
-  long * count_ptrs[NUM_HUFF_TBLS];
-} phuff_entropy_encoder;
-
-typedef phuff_entropy_encoder * phuff_entropy_ptr;
-
-/* MAX_CORR_BITS is the number of bits the AC refinement correction-bit
- * buffer can hold.  Larger sizes may slightly improve compression, but
- * 1000 is already well into the realm of overkill.
- * The minimum safe size is 64 bits.
- */
-
-#define MAX_CORR_BITS  1000	/* Max # of correction bits I can buffer */
-
-/* IRIGHT_SHIFT is like RIGHT_SHIFT, but works on int rather than INT32.
- * We assume that int right shift is unsigned if INT32 right shift is,
- * which should be safe.
- */
-
-#ifdef RIGHT_SHIFT_IS_UNSIGNED
-#define ISHIFT_TEMPS	int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-    ((ishift_temp = (x)) < 0 ? \
-     (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-     (ishift_temp >> (shft)))
-#else
-#define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)	((x) >> (shft))
-#endif
-
-/* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first JPP((j_compress_ptr cinfo,
-                        JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_first JPP((j_compress_ptr cinfo,
-                        JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_DC_refine JPP((j_compress_ptr cinfo,
-                         JBLOCKROW *MCU_data));
-METHODDEF(boolean) encode_mcu_AC_refine JPP((j_compress_ptr cinfo,
-                         JBLOCKROW *MCU_data));
-METHODDEF(void) finish_pass_phuff JPP((j_compress_ptr cinfo));
-METHODDEF(void) finish_pass_gather_phuff JPP((j_compress_ptr cinfo));
-
-
-/*
- * Initialize for a Huffman-compressed scan using progressive JPEG.
- */
-
-METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-
-  entropy->cinfo = cinfo;
-  entropy->gather_statistics = gather_statistics;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* We assume jcmaster.c already validated the scan parameters. */
-
-  /* Select execution routines */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_first;
-    else
-      entropy->pub.encode_mcu = encode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.encode_mcu = encode_mcu_DC_refine;
-    else {
-      entropy->pub.encode_mcu = encode_mcu_AC_refine;
-      /* AC refinement needs a correction bit buffer */
-      if (entropy->bit_buffer == NULL)
-    entropy->bit_buffer = (char *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                      MAX_CORR_BITS * SIZEOF(char));
-    }
-  }
-  if (gather_statistics)
-    entropy->pub.finish_pass = finish_pass_gather_phuff;
-  else
-    entropy->pub.finish_pass = finish_pass_phuff;
-
-  /* Only DC coefficients may be interleaved, so cinfo->comps_in_scan = 1
-   * for AC coefficients.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Initialize DC predictions to 0 */
-    entropy->last_dc_val[ci] = 0;
-    /* Get table index */
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-    continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      entropy->ac_tbl_no = tbl = compptr->ac_tbl_no;
-    }
-    if (gather_statistics) {
-      /* Check for invalid table index */
-      /* (make_c_derived_tbl does this in the other path) */
-      if (tbl < 0 || tbl >= NUM_HUFF_TBLS)
-        ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, tbl);
-      /* Allocate and zero the statistics tables */
-      /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
-      if (entropy->count_ptrs[tbl] == NULL)
-    entropy->count_ptrs[tbl] = (long *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                      257 * SIZEOF(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * SIZEOF(long));
-    } else {
-      /* Compute derived values for Huffman table */
-      /* We may do this more than once for a table, but it's not expensive */
-      jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-                  & entropy->derived_tbls[tbl]);
-    }
-  }
-
-  /* Initialize AC stuff */
-  entropy->EOBRUN = 0;
-  entropy->BE = 0;
-
-  /* Initialize bit buffer to empty */
-  entropy->put_buffer = 0;
-  entropy->put_bits = 0;
-
-  /* Initialize restart stuff */
-  entropy->restarts_to_go = cinfo->restart_interval;
-  entropy->next_restart_num = 0;
-}
-
-
-/* Outputting bytes to the file.
- * NB: these must be called only when actually outputting,
- * that is, entropy->gather_statistics == FALSE.
- */
-
-/* Emit a byte */
-#define emit_byte(entropy,val)  \
-    { *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-      if (--(entropy)->free_in_buffer == 0)  \
-        dump_buffer(entropy); }
-
-
-LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
-/* Empty the output buffer; we do not support suspension in this module. */
-{
-  struct jpeg_destination_mgr * dest = entropy->cinfo->dest;
-
-  if (! (*dest->empty_output_buffer) (entropy->cinfo))
-    ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
-  /* After a successful buffer dump, must reset buffer pointers */
-  entropy->next_output_byte = dest->next_output_byte;
-  entropy->free_in_buffer = dest->free_in_buffer;
-}
-
-
-/* Outputting bits to the file */
-
-/* Only the right 24 bits of put_buffer are used; the valid bits are
- * left-justified in this part.  At most 16 bits can be passed to emit_bits
- * in one call, and we never retain more than 7 bits in put_buffer
- * between calls, so 24 bits are sufficient.
- */
-
-INLINE
-LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
-/* Emit some bits, unless we are in gather mode */
-{
-  /* This routine is heavily used, so it's worth coding tightly. */
-  register INT32 put_buffer = (INT32) code;
-  register int put_bits = entropy->put_bits;
-
-  /* if size is 0, caller used an invalid Huffman table entry */
-  if (size == 0)
-    ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-  if (entropy->gather_statistics)
-    return;			/* do nothing if we're only getting stats */
-
-  put_buffer &= (((INT32) 1)<<size) - 1; /* mask off any extra bits in code */
-
-  put_bits += size;		/* new number of bits in buffer */
-
-  put_buffer <<= 24 - put_bits; /* align incoming bits */
-
-  put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
-
-  while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
-
-    emit_byte(entropy, c);
-    if (c == 0xFF) {		/* need to stuff a zero byte? */
-      emit_byte(entropy, 0);
-    }
-    put_buffer <<= 8;
-    put_bits -= 8;
-  }
-
-  entropy->put_buffer = put_buffer; /* update variables */
-  entropy->put_bits = put_bits;
-}
-
-
-LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
-{
-  emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
-  entropy->put_buffer = 0;     /* and reset bit-buffer to empty */
-  entropy->put_bits = 0;
-}
-
-
-/*
- * Emit (or just count) a Huffman symbol.
- */
-
-INLINE
-LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
-{
-  if (entropy->gather_statistics)
-    entropy->count_ptrs[tbl_no][symbol]++;
-  else {
-    c_derived_tbl * tbl = entropy->derived_tbls[tbl_no];
-    emit_bits(entropy, tbl->ehufco[symbol], tbl->ehufsi[symbol]);
-  }
-}
-
-
-/*
- * Emit bits from a correction bit buffer.
- */
-
-LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char * bufstart,
-            unsigned int nbits)
-{
-  if (entropy->gather_statistics)
-    return;			/* no real work */
-
-  while (nbits > 0) {
-    emit_bits(entropy, (unsigned int) (*bufstart), 1);
-    bufstart++;
-    nbits--;
-  }
-}
-
-
-/*
- * Emit any pending EOBRUN symbol.
- */
-
-LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
-{
-  register int temp, nbits;
-
-  if (entropy->EOBRUN > 0) {	/* if there is any pending EOBRUN */
-    temp = entropy->EOBRUN;
-    nbits = 0;
-    while ((temp >>= 1))
-      nbits++;
-    /* safety check: shouldn't happen given limited correction-bit buffer */
-    if (nbits > 14)
-      ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
-
-    emit_symbol(entropy, entropy->ac_tbl_no, nbits << 4);
-    if (nbits)
-      emit_bits(entropy, entropy->EOBRUN, nbits);
-
-    entropy->EOBRUN = 0;
-
-    /* Emit any buffered correction bits */
-    emit_buffered_bits(entropy, entropy->bit_buffer, entropy->BE);
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * Emit a restart marker & resynchronize predictions.
- */
-
-LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
-{
-  int ci;
-
-  emit_eobrun(entropy);
-
-  if (! entropy->gather_statistics) {
-    flush_bits(entropy);
-    emit_byte(entropy, 0xFF);
-    emit_byte(entropy, JPEG_RST0 + restart_num);
-  }
-
-  if (entropy->cinfo->Ss == 0) {
-    /* Re-initialize DC predictions to 0 */
-    for (ci = 0; ci < entropy->cinfo->comps_in_scan; ci++)
-      entropy->last_dc_val[ci] = 0;
-  } else {
-    /* Re-initialize all AC-related fields to 0 */
-    entropy->EOBRUN = 0;
-    entropy->BE = 0;
-  }
-}
-
-
-/*
- * MCU encoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  int blkn, ci;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  jpeg_component_info * compptr;
-  ISHIFT_TEMPS
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-    ci = cinfo->MCU_membership[blkn];
-    compptr = cinfo->cur_comp_info[ci];
-
-    /* Compute the DC value after the required point transform by Al.
-     * This is simply an arithmetic right shift.
-     */
-    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
-
-    /* DC differences are figured on the point-transformed values. */
-    temp = temp2 - entropy->last_dc_val[ci];
-    entropy->last_dc_val[ci] = temp2;
-
-    /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
-      /* This code assumes we are on a two's complement machine */
-      temp2--;
-    }
-
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
-    /* Check for out-of-range coefficient values.
-     * Since we're encoding a difference, the range limit is twice as much.
-     */
-    if (nbits > MAX_COEF_BITS+1)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-
-    /* Count/emit the Huffman-coded symbol for the number of bits */
-    emit_symbol(entropy, compptr->dc_tbl_no, nbits);
-
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    if (nbits)			/* emit_bits rejects calls with size 0 */
-      emit_bits(entropy, (unsigned int) temp2, nbits);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
-  register int nbits;
-  register int r, k;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
-
-  r = 0;			/* r = run length of zeros */
-
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
-      r++;
-      continue;
-    }
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value; so the code is
-     * interwoven with finding the abs value (temp) and output bits (temp2).
-     */
-    if (temp < 0) {
-      temp = -temp;		/* temp is abs value of input */
-      temp >>= Al;		/* apply the point transform */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-      temp2 = ~temp;
-    } else {
-      temp >>= Al;		/* apply the point transform */
-      temp2 = temp;
-    }
-    /* Watch out for case that nonzero coef is zero after point transform */
-    if (temp == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any pending EOBRUN */
-    if (entropy->EOBRUN > 0)
-      emit_eobrun(entropy);
-    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-    while (r > 15) {
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-    }
-
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;			/* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
-    /* Check for out-of-range coefficient values */
-    if (nbits > MAX_COEF_BITS)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
-
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    emit_bits(entropy, (unsigned int) temp2, nbits);
-
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0) {			/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    if (entropy->EOBRUN == 0x7FFF)
-      emit_eobrun(entropy);	/* force it out to avoid overflow */
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  int blkn;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data blocks */
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* We simply emit the Al'th bit of the DC coefficient value. */
-    temp = (*block)[0];
-    emit_bits(entropy, (unsigned int) (temp >> Al), 1);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * MCU encoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  register int r, k;
-  int EOB;
-  char *BR_buffer;
-  unsigned int BR;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  JBLOCKROW block;
-  int absvalues[DCTSIZE2];
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Emit restart marker if needed */
-  if (cinfo->restart_interval)
-    if (entropy->restarts_to_go == 0)
-      emit_restart(entropy, entropy->next_restart_num);
-
-  /* Encode the MCU data block */
-  block = MCU_data[0];
-
-  /* It is convenient to make a pre-pass to determine the transformed
-   * coefficients' absolute values and the EOB position.
-   */
-  EOB = 0;
-  for (k = cinfo->Ss; k <= Se; k++) {
-    temp = (*block)[jpeg_natural_order[k]];
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value.
-     */
-    if (temp < 0)
-      temp = -temp;		/* temp is abs value of input */
-    temp >>= Al;		/* apply the point transform */
-    absvalues[k] = temp;	/* save abs value for main pass */
-    if (temp == 1)
-      EOB = k;			/* EOB = index of last newly-nonzero coef */
-  }
-
-  /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
-
-  r = 0;			/* r = run length of zeros */
-  BR = 0;			/* BR = count of buffered bits added now */
-  BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
-
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = absvalues[k]) == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any required ZRLs, but not if they can be folded into EOB */
-    while (r > 15 && k <= EOB) {
-      /* emit any pending EOBRUN and the BE correction bits */
-      emit_eobrun(entropy);
-      /* Emit ZRL */
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-      /* Emit buffered correction bits that must be associated with ZRL */
-      emit_buffered_bits(entropy, BR_buffer, BR);
-      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-      BR = 0;
-    }
-
-    /* If the coef was previously nonzero, it only needs a correction bit.
-     * NOTE: a straight translation of the spec's figure G.7 would suggest
-     * that we also need to test r > 15.  But if r > 15, we can only get here
-     * if k > EOB, which implies that this coefficient is not 1.
-     */
-    if (temp > 1) {
-      /* The correction bit is the next bit of the absolute value. */
-      BR_buffer[BR++] = (char) (temp & 1);
-      continue;
-    }
-
-    /* Emit any pending EOBRUN and the BE correction bits */
-    emit_eobrun(entropy);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
-
-    /* Emit output bit for newly-nonzero coef */
-    temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
-    emit_bits(entropy, (unsigned int) temp, 1);
-
-    /* Emit buffered correction bits that must be associated with this code */
-    emit_buffered_bits(entropy, BR_buffer, BR);
-    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-    BR = 0;
-    r = 0;			/* reset zero run length */
-  }
-
-  if (r > 0 || BR > 0) {	/* If there are trailing zeroes, */
-    entropy->EOBRUN++;		/* count an EOB */
-    entropy->BE += BR;		/* concat my correction bits to older ones */
-    /* We force out the EOB if we risk either:
-     * 1. overflow of the EOB counter;
-     * 2. overflow of the correction bit buffer during the next MCU.
-     */
-    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
-      emit_eobrun(entropy);
-  }
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-
-  /* Update restart-interval state too */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0) {
-      entropy->restarts_to_go = cinfo->restart_interval;
-      entropy->next_restart_num++;
-      entropy->next_restart_num &= 7;
-    }
-    entropy->restarts_to_go--;
-  }
-
-  return TRUE;
-}
-
-
-/*
- * Finish up at the end of a Huffman-compressed progressive scan.
- */
-
-METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-
-  entropy->next_output_byte = cinfo->dest->next_output_byte;
-  entropy->free_in_buffer = cinfo->dest->free_in_buffer;
-
-  /* Flush out any buffered data */
-  emit_eobrun(entropy);
-  flush_bits(entropy);
-
-  cinfo->dest->next_output_byte = entropy->next_output_byte;
-  cinfo->dest->free_in_buffer = entropy->free_in_buffer;
-}
-
-
-/*
- * Finish up a statistics-gathering pass and create the new Huffman tables.
- */
-
-METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band;
-  int ci, tbl;
-  jpeg_component_info * compptr;
-  JHUFF_TBL **htblptr;
-  boolean did[NUM_HUFF_TBLS];
-
-  /* Flush out buffered data (all we care about is counting the EOB symbol) */
-  emit_eobrun(entropy);
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* It's important not to apply jpeg_gen_optimal_table more than once
-   * per table, because it clobbers the input frequency counts!
-   */
-  MEMZERO(did, SIZEOF(did));
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    if (is_DC_band) {
-      if (cinfo->Ah != 0)	/* DC refinement needs no table */
-    continue;
-      tbl = compptr->dc_tbl_no;
-    } else {
-      tbl = compptr->ac_tbl_no;
-    }
-    if (! did[tbl]) {
-      if (is_DC_band)
-        htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
-      else
-        htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
-      if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
-      jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
-      did[tbl] = TRUE;
-    }
-  }
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy encoding.
- */
-
-GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                SIZEOF(phuff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff;
-
-  /* Mark tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-    entropy->count_ptrs[i] = NULL;
-  }
-  entropy->bit_buffer = NULL;	/* needed only in AC refinement scan */
-}
-
-#endif /* C_PROGRESSIVE_SUPPORTED */
--- a/3rdparty/libjpeg/jcprepct.c
+++ b/3rdparty/libjpeg/jcprepct.c
@@ -173,10 +173,12 @@ pre_process_data (j_compress_ptr cinfo,
        *out_row_group_ctr < out_row_groups_avail) {
      for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
           ci++, compptr++) {
+        numrows = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+                  cinfo->min_DCT_v_scaled_size;
        expand_bottom_edge(output_buf[ci],
-               compptr->width_in_blocks * DCTSIZE,
-               (int) (*out_row_group_ctr * compptr->v_samp_factor),
-               (int) (out_row_groups_avail * compptr->v_samp_factor));
+                           compptr->width_in_blocks * compptr->DCT_h_scaled_size,
+                           (int) (*out_row_group_ctr * numrows),
+                           (int) (out_row_groups_avail * numrows));
      }
      *out_row_group_ctr = out_row_groups_avail;
      break;			/* can exit outer loop without test */
@@ -288,7 +290,8 @@ create_context_buffer (j_compress_ptr cinfo)
     */
    true_buffer = (*cinfo->mem->alloc_sarray)
      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+       (JDIMENSION) (((long) compptr->width_in_blocks *
+                      cinfo->min_DCT_h_scaled_size *
                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
       (JDIMENSION) (3 * rgroup_height));
    /* Copy true buffer row pointers into the middle of the fake row array */
@@ -346,7 +349,8 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
         ci++, compptr++) {
      prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
+         (JDIMENSION) (((long) compptr->width_in_blocks *
+                        cinfo->min_DCT_h_scaled_size *
                        cinfo->max_h_samp_factor) / compptr->h_samp_factor),
         (JDIMENSION) cinfo->max_v_samp_factor);
    }
--- a/3rdparty/libjpeg/jcsample.c
+++ b/3rdparty/libjpeg/jcsample.c
@@ -62,6 +62,15 @@ typedef struct {

  /* Downsampling method pointers, one per component */
  downsample1_ptr methods[MAX_COMPONENTS];
+
+  /* Height of an output row group for each component. */
+  int rowgroup_height[MAX_COMPONENTS];
+
+  /* These arrays save pixel expansion factors so that int_downsample need not
+   * recompute them each time.  They are unused for other downsampling methods.
+   */
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
 } my_downsampler;

 typedef my_downsampler * my_downsample_ptr;
@@ -123,7 +132,8 @@ sep_downsample (j_compress_ptr cinfo,
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
    in_ptr = input_buf[ci] + in_row_index;
-    out_ptr = output_buf[ci] + (out_row_group_index * compptr->v_samp_factor);
+    out_ptr = output_buf[ci] +
+              (out_row_group_index * downsample->rowgroup_height[ci]);
    (*downsample->methods[ci]) (cinfo, compptr, in_ptr, out_ptr);
  }
 }
@@ -140,14 +150,15 @@ METHODDEF(void)
 int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
  int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
  JDIMENSION outcol, outcol_h;	/* outcol_h == outcol*h_expand */
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
  JSAMPROW inptr, outptr;
  INT32 outvalue;

-  h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
-  v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
+  h_expand = downsample->h_expand[compptr->component_index];
+  v_expand = downsample->v_expand[compptr->component_index];
  numpix = h_expand * v_expand;
  numpix2 = numpix/2;

@@ -158,8 +169,8 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
  expand_right_edge(input_data, cinfo->max_v_samp_factor,
                    cinfo->image_width, output_cols * h_expand);

-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
    outptr = output_data[outrow];
    for (outcol = 0, outcol_h = 0; outcol < output_cols;
         outcol++, outcol_h += h_expand) {
@@ -173,6 +184,7 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
      *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
    }
    inrow += v_expand;
+    outrow++;
  }
 }

@@ -191,8 +203,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
  jcopy_sample_rows(input_data, 0, output_data, 0,
                    cinfo->max_v_samp_factor, cinfo->image_width);
  /* Edge-expand */
-  expand_right_edge(output_data, cinfo->max_v_samp_factor,
-            cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+  expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    compptr->width_in_blocks * compptr->DCT_h_scaled_size);
 }


@@ -212,9 +224,9 @@ METHODDEF(void)
 h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
                 JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
  JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
  register JSAMPROW inptr, outptr;
  register int bias;

@@ -225,9 +237,9 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
  expand_right_edge(input_data, cinfo->max_v_samp_factor,
                    cinfo->image_width, output_cols * 2);

-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
    bias = 0;			/* bias = 0,1,0,1,... for successive samples */
    for (outcol = 0; outcol < output_cols; outcol++) {
      *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
@@ -251,7 +263,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  int inrow, outrow;
  JDIMENSION outcol;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
  register JSAMPROW inptr0, inptr1, outptr;
  register int bias;

@@ -262,8 +274,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
  expand_right_edge(input_data, cinfo->max_v_samp_factor,
                    cinfo->image_width, output_cols * 2);

-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
    outptr = output_data[outrow];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow+1];
@@ -276,6 +288,7 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
      inptr0 += 2; inptr1 += 2;
    }
    inrow += 2;
+    outrow++;
  }
 }

@@ -294,7 +307,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 {
  int inrow, outrow;
  JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
  register JSAMPROW inptr0, inptr1, above_ptr, below_ptr, outptr;
  INT32 membersum, neighsum, memberscale, neighscale;

@@ -321,8 +334,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
  memberscale = 16384 - cinfo->smoothing_factor * 80; /* scaled (1-5*SF)/4 */
  neighscale = cinfo->smoothing_factor * 16; /* scaled SF/4 */

-  inrow = 0;
-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
+  inrow = outrow = 0;
+  while (inrow < cinfo->max_v_samp_factor) {
    outptr = output_data[outrow];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow+1];
@@ -378,6 +391,7 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
    *outptr = (JSAMPLE) ((membersum + 32768) >> 16);

    inrow += 2;
+    outrow++;
  }
 }

@@ -392,9 +406,9 @@ METHODDEF(void)
 fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
                            JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int outrow;
+  int inrow;
  JDIMENSION colctr;
-  JDIMENSION output_cols = compptr->width_in_blocks * DCTSIZE;
+  JDIMENSION output_cols = compptr->width_in_blocks * compptr->DCT_h_scaled_size;
  register JSAMPROW inptr, above_ptr, below_ptr, outptr;
  INT32 membersum, neighsum, memberscale, neighscale;
  int colsum, lastcolsum, nextcolsum;
@@ -415,11 +429,11 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  memberscale = 65536L - cinfo->smoothing_factor * 512L; /* scaled 1-8*SF */
  neighscale = cinfo->smoothing_factor * 64; /* scaled SF */

-  for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
-    outptr = output_data[outrow];
-    inptr = input_data[outrow];
-    above_ptr = input_data[outrow-1];
-    below_ptr = input_data[outrow+1];
+  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
+    outptr = output_data[inrow];
+    inptr = input_data[inrow];
+    above_ptr = input_data[inrow-1];
+    below_ptr = input_data[inrow+1];

    /* Special case for first column */
    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
@@ -467,6 +481,7 @@ jinit_downsampler (j_compress_ptr cinfo)
  int ci;
  jpeg_component_info * compptr;
  boolean smoothok = TRUE;
+  int h_in_group, v_in_group, h_out_group, v_out_group;

  downsample = (my_downsample_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
@@ -482,8 +497,17 @@ jinit_downsampler (j_compress_ptr cinfo)
  /* Verify we can handle the sampling factors, and set up method pointers */
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    if (compptr->h_samp_factor == cinfo->max_h_samp_factor &&
-    compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    /* Compute size of an "output group" for DCT scaling.  This many samples
+     * are to be converted from max_h_samp_factor * max_v_samp_factor pixels.
+     */
+    h_out_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+                  cinfo->min_DCT_h_scaled_size;
+    v_out_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+                  cinfo->min_DCT_v_scaled_size;
+    h_in_group = cinfo->max_h_samp_factor;
+    v_in_group = cinfo->max_v_samp_factor;
+    downsample->rowgroup_height[ci] = v_out_group; /* save for use later */
+    if (h_in_group == h_out_group && v_in_group == v_out_group) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
      if (cinfo->smoothing_factor) {
        downsample->methods[ci] = fullsize_smooth_downsample;
@@ -491,12 +515,12 @@ jinit_downsampler (j_compress_ptr cinfo)
      } else
 #endif
        downsample->methods[ci] = fullsize_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-           compptr->v_samp_factor == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+               v_in_group == v_out_group) {
      smoothok = FALSE;
      downsample->methods[ci] = h2v1_downsample;
-    } else if (compptr->h_samp_factor * 2 == cinfo->max_h_samp_factor &&
-           compptr->v_samp_factor * 2 == cinfo->max_v_samp_factor) {
+    } else if (h_in_group == h_out_group * 2 &&
+               v_in_group == v_out_group * 2) {
 #ifdef INPUT_SMOOTHING_SUPPORTED
      if (cinfo->smoothing_factor) {
        downsample->methods[ci] = h2v2_smooth_downsample;
@@ -504,10 +528,12 @@ jinit_downsampler (j_compress_ptr cinfo)
      } else
 #endif
        downsample->methods[ci] = h2v2_downsample;
-    } else if ((cinfo->max_h_samp_factor % compptr->h_samp_factor) == 0 &&
-           (cinfo->max_v_samp_factor % compptr->v_samp_factor) == 0) {
+    } else if ((h_in_group % h_out_group) == 0 &&
+               (v_in_group % v_out_group) == 0) {
      smoothok = FALSE;
      downsample->methods[ci] = int_downsample;
+      downsample->h_expand[ci] = (UINT8) (h_in_group / h_out_group);
+      downsample->v_expand[ci] = (UINT8) (v_in_group / v_out_group);
    } else
      ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
  }
--- a/3rdparty/libjpeg/jctrans.c
+++ b/3rdparty/libjpeg/jctrans.c
@@ -2,6 +2,7 @@
 * jctrans.c
 *
 * Copyright (C) 1995-1998, Thomas G. Lane.
+ * Modified 2000-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -76,11 +77,18 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
  dstinfo->image_height = srcinfo->image_height;
  dstinfo->input_components = srcinfo->num_components;
  dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->jpeg_width = srcinfo->output_width;
+  dstinfo->jpeg_height = srcinfo->output_height;
+  dstinfo->min_DCT_h_scaled_size = srcinfo->min_DCT_h_scaled_size;
+  dstinfo->min_DCT_v_scaled_size = srcinfo->min_DCT_v_scaled_size;
  /* Initialize all parameters to default values */
  jpeg_set_defaults(dstinfo);
  /* jpeg_set_defaults may choose wrong colorspace, eg YCbCr if input is RGB.
   * Fix it to get the right header markers for the image colorspace.
+   * Note: Entropy table assignment in jpeg_set_colorspace depends
+   * on color_transform.
   */
+  dstinfo->color_transform = srcinfo->color_transform;
  jpeg_set_colorspace(dstinfo, srcinfo->jpeg_color_space);
  dstinfo->data_precision = srcinfo->data_precision;
  dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling;
@@ -125,7 +133,7 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
          ERREXIT1(dstinfo, JERR_MISMATCHED_QUANT_TABLE, tblno);
      }
    }
-    /* Note: we do not copy the source's Huffman table assignments;
+    /* Note: we do not copy the source's entropy table assignments;
     * instead we rely on jpeg_set_colorspace to have made a suitable choice.
     */
  }
@@ -158,24 +166,13 @@ LOCAL(void)
 transencode_master_selection (j_compress_ptr cinfo,
                              jvirt_barray_ptr * coef_arrays)
 {
-  /* Although we don't actually use input_components for transcoding,
-   * jcmaster.c's initial_setup will complain if input_components is 0.
-   */
-  cinfo->input_components = 1;
  /* Initialize master control (includes parameter checking/processing) */
  jinit_c_master_control(cinfo, TRUE /* transcode only */);

  /* Entropy encoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef C_PROGRESSIVE_SUPPORTED
-      jinit_phuff_encoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
+  if (cinfo->arith_code)
+    jinit_arith_encoder(cinfo);
+  else {
    jinit_huff_encoder(cinfo);
  }

@@ -370,7 +367,7 @@ transencode_coef_controller (j_compress_ptr cinfo,
  coef = (my_coef_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+  cinfo->coef = &coef->pub;
  coef->pub.start_pass = start_pass_coef;
  coef->pub.compress_data = compress_output;

@@ -381,7 +378,7 @@ transencode_coef_controller (j_compress_ptr cinfo,
  buffer = (JBLOCKROW)
    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
-  jzero_far((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
+  FMEMZERO((void FAR *) buffer, C_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK));
  for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
    coef->dummy_buffer[i] = buffer + i;
  }
--- a/3rdparty/libjpeg/jdapimin.c
+++ b/3rdparty/libjpeg/jdapimin.c
@@ -2,6 +2,7 @@
 * jdapimin.c
 *
 * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -185,8 +186,8 @@ default_decompress_parms (j_decompress_ptr cinfo)
  }

  /* Set defaults for other decompression parameters. */
-  cinfo->scale_num = 1;		/* 1:1 scaling */
-  cinfo->scale_denom = 1;
+  cinfo->scale_num = cinfo->block_size;		/* 1:1 scaling */
+  cinfo->scale_denom = cinfo->block_size;
  cinfo->output_gamma = 1.0;
  cinfo->buffered_image = FALSE;
  cinfo->raw_data_out = FALSE;
--- a/3rdparty/libjpeg/jdapistd.c
+++ b/3rdparty/libjpeg/jdapistd.c
@@ -202,7 +202,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
  }

  /* Verify that at least one iMCU row can be returned. */
-  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size;
+  lines_per_iMCU_row = cinfo->max_v_samp_factor * cinfo->min_DCT_v_scaled_size;
  if (max_lines < lines_per_iMCU_row)
    ERREXIT(cinfo, JERR_BUFFER_SIZE);

--- a/3rdparty/libjpeg/jdarith.c
+++ b/3rdparty/libjpeg/jdarith.c
@@ -0,0 +1,782 @@
+/*
+ * jdarith.c
+ *
+ * Developed 1997-2012 by Guido Vollbeding.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file contains portable arithmetic entropy decoding routines for JPEG
+ * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ *
+ * Both sequential and progressive modes are supported in this single module.
+ *
+ * Suspension is not currently supported in this module.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+
+
+/* Expanded entropy decoder object for arithmetic decoding. */
+
+typedef struct {
+  struct jpeg_entropy_decoder pub; /* public fields */
+
+  INT32 c;       /* C register, base of coding interval + input bit buffer */
+  INT32 a;               /* A register, normalized size of coding interval */
+  int ct;     /* bit shift counter, # of bits left in bit buffer part of C */
+                                                         /* init: ct = -16 */
+                                                         /* run: ct = 0..7 */
+                                                         /* error: ct = -1 */
+  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
+  int dc_context[MAX_COMPS_IN_SCAN]; /* context index for DC conditioning */
+
+  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
+
+  /* Pointers to statistics areas (these workspaces have image lifespan) */
+  unsigned char * dc_stats[NUM_ARITH_TBLS];
+  unsigned char * ac_stats[NUM_ARITH_TBLS];
+
+  /* Statistics bin for coding with fixed probability 0.5 */
+  unsigned char fixed_bin[4];
+} arith_entropy_decoder;
+
+typedef arith_entropy_decoder * arith_entropy_ptr;
+
+/* The following two definitions specify the allocation chunk size
+ * for the statistics area.
+ * According to sections F.1.4.4.1.3 and F.1.4.4.2, we need at least
+ * 49 statistics bins for DC, and 245 statistics bins for AC coding.
+ *
+ * We use a compact representation with 1 byte per statistics bin,
+ * thus the numbers directly represent byte sizes.
+ * This 1 byte per statistics bin contains the meaning of the MPS
+ * (more probable symbol) in the highest bit (mask 0x80), and the
+ * index into the probability estimation state machine table
+ * in the lower bits (mask 0x7F).
+ */
+
+#define DC_STAT_BINS 64
+#define AC_STAT_BINS 256
+
+
+LOCAL(int)
+get_byte (j_decompress_ptr cinfo)
+/* Read next input byte; we do not support suspension in this module. */
+{
+  struct jpeg_source_mgr * src = cinfo->src;
+
+  if (src->bytes_in_buffer == 0)
+    if (! (*src->fill_input_buffer) (cinfo))
+      ERREXIT(cinfo, JERR_CANT_SUSPEND);
+  src->bytes_in_buffer--;
+  return GETJOCTET(*src->next_input_byte++);
+}
+
+
+/*
+ * The core arithmetic decoding routine (common in JPEG and JBIG).
+ * This needs to go as fast as possible.
+ * Machine-dependent optimization facilities
+ * are not utilized in this portable implementation.
+ * However, this code should be fairly efficient and
+ * may be a good base for further optimizations anyway.
+ *
+ * Return value is 0 or 1 (binary decision).
+ *
+ * Note: I've changed the handling of the code base & bit
+ * buffer register C compared to other implementations
+ * based on the standards layout & procedures.
+ * While it also contains both the actual base of the
+ * coding interval (16 bits) and the next-bits buffer,
+ * the cut-point between these two parts is floating
+ * (instead of fixed) with the bit shift counter CT.
+ * Thus, we also need only one (variable instead of
+ * fixed size) shift for the LPS/MPS decision, and
+ * we can get away with any renormalization update
+ * of C (except for new data insertion, of course).
+ *
+ * I've also introduced a new scheme for accessing
+ * the probability estimation state machine table,
+ * derived from Markus Kuhn's JBIG implementation.
+ */
+
+LOCAL(int)
+arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+{
+  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register unsigned char nl, nm;
+  register INT32 qe, temp;
+  register int sv, data;
+
+  /* Renormalization & data input per section D.2.6 */
+  while (e->a < 0x8000L) {
+    if (--e->ct < 0) {
+      /* Need to fetch next data byte */
+      if (cinfo->unread_marker)
+        data = 0;		/* stuff zero data */
+      else {
+        data = get_byte(cinfo);	/* read next input byte */
+        if (data == 0xFF) {	/* zero stuff or marker code */
+          do data = get_byte(cinfo);
+          while (data == 0xFF);	/* swallow extra 0xFF bytes */
+          if (data == 0)
+            data = 0xFF;	/* discard stuffed zero byte */
+          else {
+            /* Note: Different from the Huffman decoder, hitting
+             * a marker while processing the compressed data
+             * segment is legal in arithmetic coding.
+             * The convention is to supply zero data
+             * then until decoding is complete.
+             */
+            cinfo->unread_marker = data;
+            data = 0;
+          }
+        }
+      }
+      e->c = (e->c << 8) | data; /* insert data into C register */
+      if ((e->ct += 8) < 0)	 /* update bit shift counter */
+        /* Need more initial bytes */
+        if (++e->ct == 0)
+          /* Got 2 initial bytes -> re-init A and exit loop */
+          e->a = 0x8000L; /* => e->a = 0x10000L after loop exit */
+    }
+    e->a <<= 1;
+  }
+
+  /* Fetch values from our compact representation of Table D.3(D.2):
+   * Qe values and probability estimation state machine
+   */
+  sv = *st;
+  qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
+  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
+
+  /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
+  temp = e->a - qe;
+  e->a = temp;
+  temp <<= e->ct;
+  if (e->c >= temp) {
+    e->c -= temp;
+    /* Conditional LPS (less probable symbol) exchange */
+    if (e->a < qe) {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    } else {
+      e->a = qe;
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    }
+  } else if (e->a < 0x8000L) {
+    /* Conditional MPS (more probable symbol) exchange */
+    if (e->a < qe) {
+      *st = (sv & 0x80) ^ nl;	/* Estimate_after_LPS */
+      sv ^= 0x80;		/* Exchange LPS/MPS */
+    } else {
+      *st = (sv & 0x80) ^ nm;	/* Estimate_after_MPS */
+    }
+  }
+
+  return sv >> 7;
+}
+
+
+/*
+ * Check for a restart marker & resynchronize decoder.
+ */
+
+LOCAL(void)
+process_restart (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci;
+  jpeg_component_info * compptr;
+
+  /* Advance past the RSTn marker */
+  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+    ERREXIT(cinfo, JERR_CANT_SUSPEND);
+
+  /* Re-initialize statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      /* Reset DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+        (cinfo->progressive_mode && cinfo->Ss)) {
+      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+    }
+  }
+
+  /* Reset arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Reset restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Arithmetic MCU decoding.
+ * Each of these routines decodes and returns one MCU's worth of
+ * arithmetic-compressed coefficients.
+ * The coefficients are reordered from zigzag order into natural array order,
+ * but are not dequantized.
+ *
+ * The i'th block of the MCU is stored into the block pointed to by
+ * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
+ */
+
+/*
+ * MCU decoding for DC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign;
+  int v, m;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    tbl = cinfo->cur_comp_info[ci]->dc_tbl_no;
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+        st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;			/* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+        entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+        entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+        entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
+    (*block)[0] = (JCOEF) (entropy->last_dc_val[ci] << cinfo->Al);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC initial scan (either spectral selection,
+ * or first pass of successive approximation).
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  unsigned char *st;
+  int tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+  /* Figure F.20: Decode_AC_coefficients */
+  k = cinfo->Ss - 1;
+  do {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    if (arith_decode(cinfo, st)) break;		/* EOB flag */
+    for (;;) {
+      k++;
+      if (arith_decode(cinfo, st + 1)) break;
+      st += 3;
+      if (k >= cinfo->Se) {
+        WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+        entropy->ct = -1;			/* spectral overflow */
+        return TRUE;
+      }
+    }
+    /* Figure F.21: Decoding nonzero value v */
+    /* Figure F.22: Decoding the sign of v */
+    sign = arith_decode(cinfo, entropy->fixed_bin);
+    st += 2;
+    /* Figure F.23: Decoding the magnitude category of v */
+    if ((m = arith_decode(cinfo, st)) != 0) {
+      if (arith_decode(cinfo, st)) {
+        m <<= 1;
+        st = entropy->ac_stats[tbl] +
+             (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;			/* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
+      }
+    }
+    v = m;
+    /* Figure F.24: Decoding the magnitude bit pattern of v */
+    st += 14;
+    while (m >>= 1)
+      if (arith_decode(cinfo, st)) v |= m;
+    v += 1; if (sign) v = -v;
+    /* Scale and output coefficient in natural (dezigzagged) order */
+    (*block)[natural_order[k]] = (JCOEF) (v << cinfo->Al);
+  } while (k < cinfo->Se);
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for DC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  unsigned char *st;
+  int p1, blkn;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  st = entropy->fixed_bin;	/* use fixed probability estimation */
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    /* Encoded data is simply the next bit of the two's-complement DC value */
+    if (arith_decode(cinfo, st))
+      MCU_data[blkn][0][0] |= p1;
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * MCU decoding for AC successive approximation refinement scan.
+ */
+
+METHODDEF(boolean)
+decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  JBLOCKROW block;
+  JCOEFPTR thiscoef;
+  unsigned char *st;
+  int tbl, k, kex;
+  int p1, m1;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* There is always only one block per MCU */
+  block = MCU_data[0];
+  tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
+
+  p1 = 1 << cinfo->Al;		/* 1 in the bit position being coded */
+  m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
+
+  /* Establish EOBx (previous stage end-of-block) index */
+  kex = cinfo->Se;
+  do {
+    if ((*block)[natural_order[kex]]) break;
+  } while (--kex);
+
+  k = cinfo->Ss - 1;
+  do {
+    st = entropy->ac_stats[tbl] + 3 * k;
+    if (k >= kex)
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+    for (;;) {
+      thiscoef = *block + natural_order[++k];
+      if (*thiscoef) {				/* previously nonzero coef */
+        if (arith_decode(cinfo, st + 2)) {
+          if (*thiscoef < 0)
+            *thiscoef += m1;
+          else
+            *thiscoef += p1;
+        }
+        break;
+      }
+      if (arith_decode(cinfo, st + 1)) {	/* newly nonzero coef */
+        if (arith_decode(cinfo, entropy->fixed_bin))
+          *thiscoef = m1;
+        else
+          *thiscoef = p1;
+        break;
+      }
+      st += 3;
+      if (k >= cinfo->Se) {
+        WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+        entropy->ct = -1;			/* spectral overflow */
+        return TRUE;
+      }
+    }
+  } while (k < cinfo->Se);
+
+  return TRUE;
+}
+
+
+/*
+ * Decode one MCU's worth of arithmetic-compressed coefficients.
+ */
+
+METHODDEF(boolean)
+decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  jpeg_component_info * compptr;
+  JBLOCKROW block;
+  unsigned char *st;
+  int blkn, ci, tbl, sign, k;
+  int v, m;
+  const int * natural_order;
+
+  /* Process restart marker if needed */
+  if (cinfo->restart_interval) {
+    if (entropy->restarts_to_go == 0)
+      process_restart(cinfo);
+    entropy->restarts_to_go--;
+  }
+
+  if (entropy->ct == -1) return TRUE;	/* if error do nothing */
+
+  natural_order = cinfo->natural_order;
+
+  /* Outer loop handles each block in the MCU */
+
+  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
+    block = MCU_data[blkn];
+    ci = cinfo->MCU_membership[blkn];
+    compptr = cinfo->cur_comp_info[ci];
+
+    /* Sections F.2.4.1 & F.1.4.4.1: Decoding of DC coefficients */
+
+    tbl = compptr->dc_tbl_no;
+
+    /* Table F.4: Point to statistics bin S0 for DC coefficient coding */
+    st = entropy->dc_stats[tbl] + entropy->dc_context[ci];
+
+    /* Figure F.19: Decode_DC_DIFF */
+    if (arith_decode(cinfo, st) == 0)
+      entropy->dc_context[ci] = 0;
+    else {
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, st + 1);
+      st += 2; st += sign;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+        st = entropy->dc_stats[tbl] + 20;	/* Table F.4: X1 = 20 */
+        while (arith_decode(cinfo, st)) {
+          if ((m <<= 1) == 0x8000) {
+            WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+            entropy->ct = -1;			/* magnitude overflow */
+            return TRUE;
+          }
+          st += 1;
+        }
+      }
+      /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
+      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+        entropy->dc_context[ci] = 0;		   /* zero diff category */
+      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+        entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
+      else
+        entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      entropy->last_dc_val[ci] += v;
+    }
+
+    (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+
+    /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
+
+    if (cinfo->lim_Se == 0) continue;
+    tbl = compptr->ac_tbl_no;
+    k = 0;
+
+    /* Figure F.20: Decode_AC_coefficients */
+    do {
+      st = entropy->ac_stats[tbl] + 3 * k;
+      if (arith_decode(cinfo, st)) break;	/* EOB flag */
+      for (;;) {
+        k++;
+        if (arith_decode(cinfo, st + 1)) break;
+        st += 3;
+        if (k >= cinfo->lim_Se) {
+          WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+          entropy->ct = -1;			/* spectral overflow */
+          return TRUE;
+        }
+      }
+      /* Figure F.21: Decoding nonzero value v */
+      /* Figure F.22: Decoding the sign of v */
+      sign = arith_decode(cinfo, entropy->fixed_bin);
+      st += 2;
+      /* Figure F.23: Decoding the magnitude category of v */
+      if ((m = arith_decode(cinfo, st)) != 0) {
+        if (arith_decode(cinfo, st)) {
+          m <<= 1;
+          st = entropy->ac_stats[tbl] +
+               (k <= cinfo->arith_ac_K[tbl] ? 189 : 217);
+          while (arith_decode(cinfo, st)) {
+            if ((m <<= 1) == 0x8000) {
+              WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
+              entropy->ct = -1;			/* magnitude overflow */
+              return TRUE;
+            }
+            st += 1;
+          }
+        }
+      }
+      v = m;
+      /* Figure F.24: Decoding the magnitude bit pattern of v */
+      st += 14;
+      while (m >>= 1)
+        if (arith_decode(cinfo, st)) v |= m;
+      v += 1; if (sign) v = -v;
+      (*block)[natural_order[k]] = (JCOEF) v;
+    } while (k < cinfo->lim_Se);
+  }
+
+  return TRUE;
+}
+
+
+/*
+ * Initialize for an arithmetic-compressed scan.
+ */
+
+METHODDEF(void)
+start_pass (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  int ci, tbl;
+  jpeg_component_info * compptr;
+
+  if (cinfo->progressive_mode) {
+    /* Validate progressive scan parameters */
+    if (cinfo->Ss == 0) {
+      if (cinfo->Se != 0)
+        goto bad;
+    } else {
+      /* need not check Ss/Se < 0 since they came from unsigned bytes */
+      if (cinfo->Se < cinfo->Ss || cinfo->Se > cinfo->lim_Se)
+        goto bad;
+      /* AC scans may have only one component */
+      if (cinfo->comps_in_scan != 1)
+        goto bad;
+    }
+    if (cinfo->Ah != 0) {
+      /* Successive approximation refinement scan: must have Al = Ah-1. */
+      if (cinfo->Ah-1 != cinfo->Al)
+        goto bad;
+    }
+    if (cinfo->Al > 13) {	/* need not check for < 0 */
+      bad:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+               cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+    }
+    /* Update progression status, and verify that scan order is legal.
+     * Note that inter-scan inconsistencies are treated as warnings
+     * not fatal errors ... not clear if this is right way to behave.
+     */
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
+      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
+        WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
+        int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
+        if (cinfo->Ah != expected)
+          WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
+        coef_bit_ptr[coefi] = cinfo->Al;
+      }
+    }
+    /* Select MCU decoding routine */
+    if (cinfo->Ah == 0) {
+      if (cinfo->Ss == 0)
+        entropy->pub.decode_mcu = decode_mcu_DC_first;
+      else
+        entropy->pub.decode_mcu = decode_mcu_AC_first;
+    } else {
+      if (cinfo->Ss == 0)
+        entropy->pub.decode_mcu = decode_mcu_DC_refine;
+      else
+        entropy->pub.decode_mcu = decode_mcu_AC_refine;
+    }
+  } else {
+    /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
+     * This ought to be an error condition, but we make it a warning.
+     */
+    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
+        (cinfo->Se < DCTSIZE2 && cinfo->Se != cinfo->lim_Se))
+      WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
+    /* Select MCU decoding routine */
+    entropy->pub.decode_mcu = decode_mcu;
+  }
+
+  /* Allocate & initialize requested statistics areas */
+  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+    compptr = cinfo->cur_comp_info[ci];
+    if (! cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
+      tbl = compptr->dc_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->dc_stats[tbl] == NULL)
+        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+      /* Initialize DC predictions to 0 */
+      entropy->last_dc_val[ci] = 0;
+      entropy->dc_context[ci] = 0;
+    }
+    if ((! cinfo->progressive_mode && cinfo->lim_Se) ||
+        (cinfo->progressive_mode && cinfo->Ss)) {
+      tbl = compptr->ac_tbl_no;
+      if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
+        ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
+      if (entropy->ac_stats[tbl] == NULL)
+        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
+          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+    }
+  }
+
+  /* Initialize arithmetic decoding variables */
+  entropy->c = 0;
+  entropy->a = 0;
+  entropy->ct = -16;	/* force reading 2 initial bytes to fill C */
+
+  /* Initialize restart counter */
+  entropy->restarts_to_go = cinfo->restart_interval;
+}
+
+
+/*
+ * Module initialization routine for arithmetic entropy decoding.
+ */
+
+GLOBAL(void)
+jinit_arith_decoder (j_decompress_ptr cinfo)
+{
+  arith_entropy_ptr entropy;
+  int i;
+
+  entropy = (arith_entropy_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                SIZEOF(arith_entropy_decoder));
+  cinfo->entropy = &entropy->pub;
+  entropy->pub.start_pass = start_pass;
+
+  /* Mark tables unallocated */
+  for (i = 0; i < NUM_ARITH_TBLS; i++) {
+    entropy->dc_stats[i] = NULL;
+    entropy->ac_stats[i] = NULL;
+  }
+
+  /* Initialize index for fixed probability estimation */
+  entropy->fixed_bin[0] = 113;
+
+  if (cinfo->progressive_mode) {
+    /* Create progression status table */
+    int *coef_bit_ptr, ci;
+    cinfo->coef_bits = (int (*)[DCTSIZE2])
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                  cinfo->num_components*DCTSIZE2*SIZEOF(int));
+    coef_bit_ptr = & cinfo->coef_bits[0][0];
+    for (ci = 0; ci < cinfo->num_components; ci++)
+      for (i = 0; i < DCTSIZE2; i++)
+        *coef_bit_ptr++ = -1;
+  }
+}
--- a/3rdparty/libjpeg/jdatadst.c
+++ b/3rdparty/libjpeg/jdatadst.c
@@ -2,13 +2,14 @@
 * jdatadst.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains compression data destination routines for the case of
- * emitting JPEG data to a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * destination manager.
+ * emitting JPEG data to memory or to a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different destination manager.
 * IMPORTANT: we assume that fwrite() will correctly transcribe an array of
 * JOCTETs into 8-bit-wide elements on external storage.  If char is wider
 * than 8 bits on your machine, you may need to do some tweaking.
@@ -19,6 +20,11 @@
 #include "jpeglib.h"
 #include "jerror.h"

+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+

 /* Expanded data destination object for stdio output */

@@ -34,6 +40,21 @@ typedef my_destination_mgr * my_dest_ptr;
 #define OUTPUT_BUF_SIZE  4096	/* choose an efficiently fwrite'able size */


+/* Expanded data destination object for memory output */
+
+typedef struct {
+  struct jpeg_destination_mgr pub; /* public fields */
+
+  unsigned char ** outbuffer;	/* target buffer */
+  unsigned long * outsize;
+  unsigned char * newbuffer;	/* newly allocated buffer */
+  JOCTET * buffer;		/* start of buffer */
+  size_t bufsize;
+} my_mem_destination_mgr;
+
+typedef my_mem_destination_mgr * my_mem_dest_ptr;
+
+
 /*
 * Initialize destination --- called by jpeg_start_compress
 * before any data is actually written.
@@ -53,6 +74,12 @@ init_destination (j_compress_ptr cinfo)
  dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
 }

+METHODDEF(void)
+init_mem_destination (j_compress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+

 /*
 * Empty the output buffer --- called whenever buffer fills up.
@@ -92,6 +119,36 @@ empty_output_buffer (j_compress_ptr cinfo)
  return TRUE;
 }

+METHODDEF(boolean)
+empty_mem_output_buffer (j_compress_ptr cinfo)
+{
+  size_t nextsize;
+  JOCTET * nextbuffer;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  /* Try to allocate new buffer with double size */
+  nextsize = dest->bufsize * 2;
+  nextbuffer = (JOCTET *) malloc(nextsize);
+
+  if (nextbuffer == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+
+  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+
+  if (dest->newbuffer != NULL)
+    free(dest->newbuffer);
+
+  dest->newbuffer = nextbuffer;
+
+  dest->pub.next_output_byte = nextbuffer + dest->bufsize;
+  dest->pub.free_in_buffer = dest->bufsize;
+
+  dest->buffer = nextbuffer;
+  dest->bufsize = nextsize;
+
+  return TRUE;
+}
+

 /*
 * Terminate destination --- called by jpeg_finish_compress
@@ -119,6 +176,15 @@ term_destination (j_compress_ptr cinfo)
    ERREXIT(cinfo, JERR_FILE_WRITE);
 }

+METHODDEF(void)
+term_mem_destination (j_compress_ptr cinfo)
+{
+  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+
+  *dest->outbuffer = dest->buffer;
+  *dest->outsize = dest->bufsize - dest->pub.free_in_buffer;
+}
+

 /*
 * Prepare for output to a stdio stream.
@@ -149,3 +215,56 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE * outfile)
  dest->pub.term_destination = term_destination;
  dest->outfile = outfile;
 }
+
+
+/*
+ * Prepare for output to a memory buffer.
+ * The caller may supply an own initial buffer with appropriate size.
+ * Otherwise, or when the actual data output exceeds the given size,
+ * the library adapts the buffer size as necessary.
+ * The standard library functions malloc/free are used for allocating
+ * larger memory, so the buffer is available to the application after
+ * finishing compression, and then the application is responsible for
+ * freeing the requested memory.
+ * Note:  An initial buffer supplied by the caller is expected to be
+ * managed by the application.  The library does not free such buffer
+ * when allocating a larger buffer.
+ */
+
+GLOBAL(void)
+jpeg_mem_dest (j_compress_ptr cinfo,
+               unsigned char ** outbuffer, unsigned long * outsize)
+{
+  my_mem_dest_ptr dest;
+
+  if (outbuffer == NULL || outsize == NULL)	/* sanity check */
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+
+  /* The destination object is made permanent so that multiple JPEG images
+   * can be written to the same buffer without re-executing jpeg_mem_dest.
+   */
+  if (cinfo->dest == NULL) {	/* first time for this JPEG object? */
+    cinfo->dest = (struct jpeg_destination_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+                                  SIZEOF(my_mem_destination_mgr));
+  }
+
+  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest->pub.init_destination = init_mem_destination;
+  dest->pub.empty_output_buffer = empty_mem_output_buffer;
+  dest->pub.term_destination = term_mem_destination;
+  dest->outbuffer = outbuffer;
+  dest->outsize = outsize;
+  dest->newbuffer = NULL;
+
+  if (*outbuffer == NULL || *outsize == 0) {
+    /* Allocate initial buffer */
+    dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+    if (dest->newbuffer == NULL)
+      ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
+    *outsize = OUTPUT_BUF_SIZE;
+  }
+
+  dest->pub.next_output_byte = dest->buffer = *outbuffer;
+  dest->pub.free_in_buffer = dest->bufsize = *outsize;
+}
--- a/3rdparty/libjpeg/jdatasrc.c
+++ b/3rdparty/libjpeg/jdatasrc.c
@@ -2,13 +2,14 @@
 * jdatasrc.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains decompression data source routines for the case of
- * reading JPEG data from a file (or any stdio stream).  While these routines
- * are sufficient for most applications, some will want to use a different
- * source manager.
+ * reading JPEG data from memory or from a file (or any stdio stream).
+ * While these routines are sufficient for most applications,
+ * some will want to use a different source manager.
 * IMPORTANT: we assume that fread() will correctly transcribe an array of
 * JOCTETs from 8-bit-wide elements on external storage.  If char is wider
 * than 8 bits on your machine, you may need to do some tweaking.
@@ -52,6 +53,12 @@ init_source (j_decompress_ptr cinfo)
  src->start_of_file = TRUE;
 }

+METHODDEF(void)
+init_mem_source (j_decompress_ptr cinfo)
+{
+  /* no work necessary here */
+}
+

 /*
 * Fill the input buffer --- called whenever buffer is emptied.
@@ -111,6 +118,27 @@ fill_input_buffer (j_decompress_ptr cinfo)
  return TRUE;
 }

+METHODDEF(boolean)
+fill_mem_input_buffer (j_decompress_ptr cinfo)
+{
+  static const JOCTET mybuffer[4] = {
+    (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+  };
+
+  /* The whole JPEG data is expected to reside in the supplied memory
+   * buffer, so any request for more data beyond the given buffer size
+   * is treated as an error.
+   */
+  WARNMS(cinfo, JWRN_JPEG_EOF);
+
+  /* Insert a fake EOI marker */
+
+  cinfo->src->next_input_byte = mybuffer;
+  cinfo->src->bytes_in_buffer = 2;
+
+  return TRUE;
+}
+

 /*
 * Skip data --- used to skip over a potentially large amount of
@@ -127,22 +155,22 @@ fill_input_buffer (j_decompress_ptr cinfo)
 METHODDEF(void)
 skip_input_data (j_decompress_ptr cinfo, long num_bytes)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  struct jpeg_source_mgr * src = cinfo->src;

  /* Just a dumb implementation for now.  Could use fseek() except
   * it doesn't work on pipes.  Not clear that being smart is worth
   * any trouble anyway --- large skips are infrequent.
   */
  if (num_bytes > 0) {
-    while (num_bytes > (long) src->pub.bytes_in_buffer) {
-      num_bytes -= (long) src->pub.bytes_in_buffer;
-      (void) fill_input_buffer(cinfo);
+    while (num_bytes > (long) src->bytes_in_buffer) {
+      num_bytes -= (long) src->bytes_in_buffer;
+      (void) (*src->fill_input_buffer) (cinfo);
      /* note we assume that fill_input_buffer will never return FALSE,
       * so suspension need not be handled.
       */
    }
-    src->pub.next_input_byte += (size_t) num_bytes;
-    src->pub.bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t) num_bytes;
+    src->bytes_in_buffer -= (size_t) num_bytes;
  }
 }

@@ -210,3 +238,38 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE * infile)
  src->pub.bytes_in_buffer = 0; /* forces fill_input_buffer on first read */
  src->pub.next_input_byte = NULL; /* until buffer loaded */
 }
+
+
+/*
+ * Prepare for input from a supplied memory buffer.
+ * The buffer must contain the whole JPEG data.
+ */
+
+GLOBAL(void)
+jpeg_mem_src (j_decompress_ptr cinfo,
+              unsigned char * inbuffer, unsigned long insize)
+{
+  struct jpeg_source_mgr * src;
+
+  if (inbuffer == NULL || insize == 0)	/* Treat empty input as fatal error */
+    ERREXIT(cinfo, JERR_INPUT_EMPTY);
+
+  /* The source object is made permanent so that a series of JPEG images
+   * can be read from the same buffer by calling jpeg_mem_src only before
+   * the first one.
+   */
+  if (cinfo->src == NULL) {	/* first time for this JPEG object? */
+    cinfo->src = (struct jpeg_source_mgr *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+                                  SIZEOF(struct jpeg_source_mgr));
+  }
+
+  src = cinfo->src;
+  src->init_source = init_mem_source;
+  src->fill_input_buffer = fill_mem_input_buffer;
+  src->skip_input_data = skip_input_data;
+  src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
+  src->term_source = term_source;
+  src->bytes_in_buffer = (size_t) insize;
+  src->next_input_byte = (JOCTET *) inbuffer;
+}
--- a/3rdparty/libjpeg/jdcoefct.c
+++ b/3rdparty/libjpeg/jdcoefct.c
@@ -2,6 +2,7 @@
 * jdcoefct.c
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2002-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -162,7 +163,8 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
    for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
         MCU_col_num++) {
      /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void FAR *) coef->MCU_buffer[0],
+      if (cinfo->lim_Se)	/* can bypass in DC only case */
+        FMEMZERO((void FAR *) coef->MCU_buffer[0],
                 (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
        /* Suspension forced; update state counters and exit */
@@ -187,7 +189,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
        useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
                                                    : compptr->last_col_width;
        output_ptr = output_buf[compptr->component_index] +
-      yoffset * compptr->DCT_scaled_size;
+          yoffset * compptr->DCT_v_scaled_size;
        start_col = MCU_col_num * compptr->MCU_sample_width;
        for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
          if (cinfo->input_iMCU_row < last_iMCU_row ||
@@ -197,11 +199,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
              (*inverse_DCT) (cinfo, compptr,
                              (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
                              output_ptr, output_col);
-          output_col += compptr->DCT_scaled_size;
+              output_col += compptr->DCT_h_scaled_size;
            }
          }
          blkn += compptr->MCU_width;
-      output_ptr += compptr->DCT_scaled_size;
+          output_ptr += compptr->DCT_v_scaled_size;
        }
      }
    }
@@ -362,9 +364,9 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
                        output_ptr, output_col);
        buffer_ptr++;
-    output_col += compptr->DCT_scaled_size;
+        output_col += compptr->DCT_h_scaled_size;
      }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
    }
  }

@@ -654,9 +656,9 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
        DC4 = DC5; DC5 = DC6;
        DC7 = DC8; DC8 = DC9;
        buffer_ptr++, prev_block_row++, next_block_row++;
-    output_col += compptr->DCT_scaled_size;
+        output_col += compptr->DCT_h_scaled_size;
      }
-      output_ptr += compptr->DCT_scaled_size;
+      output_ptr += compptr->DCT_v_scaled_size;
    }
  }

@@ -729,6 +731,9 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
    for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
      coef->MCU_buffer[i] = buffer + i;
    }
+    if (cinfo->lim_Se == 0)	/* DC only case: want to bypass later */
+      FMEMZERO((void FAR *) buffer,
+               (size_t) (D_MAX_BLOCKS_IN_MCU * SIZEOF(JBLOCK)));
    coef->pub.consume_data = dummy_consume_data;
    coef->pub.decompress_data = decompress_onepass;
    coef->pub.coef_arrays = NULL; /* flag for no virtual arrays */
--- a/3rdparty/libjpeg/jdcolor.c
+++ b/3rdparty/libjpeg/jdcolor.c
@@ -2,6 +2,7 @@
 * jdcolor.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2011-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -23,20 +24,28 @@ typedef struct {
  int * Cb_b_tab;		/* => table for Cb to B conversion */
  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */
+
+  /* Private state for RGB->Y conversion */
+  INT32 * rgb_y_tab;		/* => table for RGB to Y conversion */
 } my_color_deconverter;

 typedef my_color_deconverter * my_cconvert_ptr;


 /**************** YCbCr -> RGB conversion: most common case **************/
+/****************   RGB -> Y   conversion: less common case **************/

 /*
 * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
 * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
 * The conversion equations to be implemented are therefore
+ *
 *	R = Y                + 1.40200 * Cr
 *	G = Y - 0.34414 * Cb - 0.71414 * Cr
 *	B = Y + 1.77200 * Cb
+ *
+ *	Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
 * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
 * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
 *
@@ -61,6 +70,18 @@ typedef my_color_deconverter * my_cconvert_ptr;
 #define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
 #define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))

+/* We allocate one big table for RGB->Y conversion and divide it up into
+ * three parts, instead of doing three alloc_small requests.  This lets us
+ * use a single table base address, which can be held in a register in the
+ * inner loops on many machines (more than can hold all three addresses,
+ * anyway).
+ */
+
+#define R_Y_OFF		0			/* offset to R => Y section */
+#define G_Y_OFF		(1*(MAXJSAMPLE+1))	/* offset to G => Y section */
+#define B_Y_OFF		(2*(MAXJSAMPLE+1))	/* etc. */
+#define TABLE_SIZE	(3*(MAXJSAMPLE+1))
+

 /*
 * Initialize tables for YCC->RGB colorspace conversion.
@@ -160,6 +181,178 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
 /**************** Cases other than YCbCr -> RGB **************/


+/*
+ * Initialize for RGB->grayscale colorspace conversion.
+ */
+
+LOCAL(void)
+build_rgb_y_table (j_decompress_ptr cinfo)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  INT32 * rgb_y_tab;
+  INT32 i;
+
+  /* Allocate and fill in the conversion tables. */
+  cconvert->rgb_y_tab = rgb_y_tab = (INT32 *)
+    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                                (TABLE_SIZE * SIZEOF(INT32)));
+
+  for (i = 0; i <= MAXJSAMPLE; i++) {
+    rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
+    rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
+    rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+  }
+}
+
+
+/*
+ * Convert RGB to grayscale.
+ */
+
+METHODDEF(void)
+rgb_gray_convert (j_decompress_ptr cinfo,
+                  JSAMPIMAGE input_buf, JDIMENSION input_row,
+                  JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register INT32 * ctab = cconvert->rgb_y_tab;
+  register int r, g, b;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr0[col]);
+      g = GETJSAMPLE(inptr1[col]);
+      b = GETJSAMPLE(inptr2[col]);
+      /* Y */
+      outptr[col] = (JSAMPLE)
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * [R-G,G,B-G] to [R,G,B] conversion with modulo calculation
+ * (inverse color transform).
+ */
+
+METHODDEF(void)
+rgb1_rgb_convert (j_decompress_ptr cinfo,
+                  JSAMPIMAGE input_buf, JDIMENSION input_row,
+                  JSAMPARRAY output_buf, int num_rows)
+{
+  register int r, g, b;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr0[col]);
+      g = GETJSAMPLE(inptr1[col]);
+      b = GETJSAMPLE(inptr2[col]);
+      /* Assume that MAXJSAMPLE+1 is a power of 2, so that the MOD
+       * (modulo) operator is equivalent to the bitmask operator AND.
+       */
+      outptr[RGB_RED]   = (JSAMPLE) ((r + g - CENTERJSAMPLE) & MAXJSAMPLE);
+      outptr[RGB_GREEN] = (JSAMPLE) g;
+      outptr[RGB_BLUE]  = (JSAMPLE) ((b + g - CENTERJSAMPLE) & MAXJSAMPLE);
+      outptr += RGB_PIXELSIZE;
+    }
+  }
+}
+
+
+/*
+ * [R-G,G,B-G] to grayscale conversion with modulo calculation
+ * (inverse color transform).
+ */
+
+METHODDEF(void)
+rgb1_gray_convert (j_decompress_ptr cinfo,
+                   JSAMPIMAGE input_buf, JDIMENSION input_row,
+                   JSAMPARRAY output_buf, int num_rows)
+{
+  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  register INT32 * ctab = cconvert->rgb_y_tab;
+  register int r, g, b;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      r = GETJSAMPLE(inptr0[col]);
+      g = GETJSAMPLE(inptr1[col]);
+      b = GETJSAMPLE(inptr2[col]);
+      /* Assume that MAXJSAMPLE+1 is a power of 2, so that the MOD
+       * (modulo) operator is equivalent to the bitmask operator AND.
+       */
+      r = (r + g - CENTERJSAMPLE) & MAXJSAMPLE;
+      b = (b + g - CENTERJSAMPLE) & MAXJSAMPLE;
+      /* Y */
+      outptr[col] = (JSAMPLE)
+                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
+                 >> SCALEBITS);
+    }
+  }
+}
+
+
+/*
+ * No colorspace change, but conversion from separate-planes
+ * to interleaved representation.
+ */
+
+METHODDEF(void)
+rgb_convert (j_decompress_ptr cinfo,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows)
+{
+  register JSAMPROW outptr;
+  register JSAMPROW inptr0, inptr1, inptr2;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    for (col = 0; col < num_cols; col++) {
+      /* We can dispense with GETJSAMPLE() here */
+      outptr[RGB_RED]   = inptr0[col];
+      outptr[RGB_GREEN] = inptr1[col];
+      outptr[RGB_BLUE]  = inptr2[col];
+      outptr += RGB_PIXELSIZE;
+    }
+  }
+}
+
+
 /*
 * Color conversion for no colorspace change: just copy the data,
 * converting from separate-planes to interleaved representation.
@@ -170,19 +363,20 @@ null_convert (j_decompress_ptr cinfo,
              JSAMPIMAGE input_buf, JDIMENSION input_row,
              JSAMPARRAY output_buf, int num_rows)
 {
-  register JSAMPROW inptr, outptr;
-  register JDIMENSION count;
-  register int num_components = cinfo->num_components;
-  JDIMENSION num_cols = cinfo->output_width;
  int ci;
+  register int nc = cinfo->num_components;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr;
+  register JDIMENSION col;
+  JDIMENSION num_cols = cinfo->output_width;

  while (--num_rows >= 0) {
-    for (ci = 0; ci < num_components; ci++) {
+    for (ci = 0; ci < nc; ci++) {
      inptr = input_buf[ci][input_row];
      outptr = output_buf[0] + ci;
-      for (count = num_cols; count > 0; count--) {
+      for (col = 0; col < num_cols; col++) {
        *outptr = *inptr++;	/* needn't bother with GETJSAMPLE() here */
-    outptr += num_components;
+        outptr += nc;
      }
    }
    input_row++;
@@ -218,7 +412,8 @@ gray_rgb_convert (j_decompress_ptr cinfo,
                  JSAMPIMAGE input_buf, JDIMENSION input_row,
                  JSAMPARRAY output_buf, int num_rows)
 {
-  register JSAMPROW inptr, outptr;
+  register JSAMPROW outptr;
+  register JSAMPROW inptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;

@@ -309,7 +504,7 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_color_deconverter));
-  cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
+  cinfo->cconvert = &cconvert->pub;
  cconvert->pub.start_pass = start_pass_dcolor;

  /* Make sure num_components agrees with jpeg_color_space */
@@ -337,6 +532,10 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
    break;
  }

+  /* Support color transform only for RGB colorspace */
+  if (cinfo->color_transform && cinfo->jpeg_color_space != JCS_RGB)
+    ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+
  /* Set out_color_components and conversion method based on requested space.
   * Also clear the component_needed flags for any unused components,
   * so that earlier pipeline stages can avoid useless computation.
@@ -351,6 +550,19 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
      /* For color->grayscale conversion, only the Y (0) component is needed */
      for (ci = 1; ci < cinfo->num_components; ci++)
        cinfo->comp_info[ci].component_needed = FALSE;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      switch (cinfo->color_transform) {
+      case JCT_NONE:
+        cconvert->pub.color_convert = rgb_gray_convert;
+        break;
+      case JCT_SUBTRACT_GREEN:
+        cconvert->pub.color_convert = rgb1_gray_convert;
+        break;
+      default:
+        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+        break;
+      }
+      build_rgb_y_table(cinfo);
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;
@@ -362,8 +574,18 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
      build_ycc_rgb_table(cinfo);
    } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
      cconvert->pub.color_convert = gray_rgb_convert;
-    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
-      cconvert->pub.color_convert = null_convert;
+    } else if (cinfo->jpeg_color_space == JCS_RGB) {
+      switch (cinfo->color_transform) {
+      case JCT_NONE:
+        cconvert->pub.color_convert = rgb_convert;
+        break;
+      case JCT_SUBTRACT_GREEN:
+        cconvert->pub.color_convert = rgb1_rgb_convert;
+        break;
+      default:
+        ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+        break;
+      }
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;
--- a/3rdparty/libjpeg/jdct.h
+++ b/3rdparty/libjpeg/jdct.h
@@ -14,11 +14,16 @@


 /*
- * A forward DCT routine is given a pointer to a work area of type DCTELEM[];
- * the DCT is to be performed in-place in that buffer.  Type DCTELEM is int
- * for 8-bit samples, INT32 for 12-bit samples.  (NOTE: Floating-point DCT
- * implementations use an array of type FAST_FLOAT, instead.)
- * The DCT inputs are expected to be signed (range +-CENTERJSAMPLE).
+ * A forward DCT routine is given a pointer to an input sample array and
+ * a pointer to a work area of type DCTELEM[]; the DCT is to be performed
+ * in-place in that buffer.  Type DCTELEM is int for 8-bit samples, INT32
+ * for 12-bit samples.  (NOTE: Floating-point DCT implementations use an
+ * array of type FAST_FLOAT, instead.)
+ * The input data is to be fetched from the sample array starting at a
+ * specified column.  (Any row offset needed will be applied to the array
+ * pointer before it is passed to the FDCT code.)
+ * Note that the number of samples fetched by the FDCT routine is
+ * DCT_h_scaled_size * DCT_v_scaled_size.
 * The DCT outputs are returned scaled up by a factor of 8; they therefore
 * have a range of +-8K for 8-bit data, +-128K for 12-bit data.  This
 * convention improves accuracy in integer implementations and saves some
@@ -32,8 +37,12 @@ typedef int DCTELEM;		/* 16 or 32 bits is fine */
 typedef INT32 DCTELEM;		/* must have 32 bits */
 #endif

-typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data));
-typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
+typedef JMETHOD(void, forward_DCT_method_ptr, (DCTELEM * data,
+                                               JSAMPARRAY sample_data,
+                                               JDIMENSION start_col));
+typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data,
+                                             JSAMPARRAY sample_data,
+                                             JDIMENSION start_col));


 /*
@@ -44,7 +53,7 @@ typedef JMETHOD(void, float_DCT_method_ptr, (FAST_FLOAT * data));
 * sample array starting at a specified column.  (Any row offset needed will
 * be applied to the array pointer before it is passed to the IDCT code.)
 * Note that the number of samples emitted by the IDCT routine is
- * DCT_scaled_size * DCT_scaled_size.
+ * DCT_h_scaled_size * DCT_v_scaled_size.
 */

 /* typedef inverse_DCT_method_ptr is declared in jpegint.h */
@@ -84,19 +93,143 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 #define jpeg_fdct_islow		jFDislow
 #define jpeg_fdct_ifast		jFDifast
 #define jpeg_fdct_float		jFDfloat
+#define jpeg_fdct_7x7		jFD7x7
+#define jpeg_fdct_6x6		jFD6x6
+#define jpeg_fdct_5x5		jFD5x5
+#define jpeg_fdct_4x4		jFD4x4
+#define jpeg_fdct_3x3		jFD3x3
+#define jpeg_fdct_2x2		jFD2x2
+#define jpeg_fdct_1x1		jFD1x1
+#define jpeg_fdct_9x9		jFD9x9
+#define jpeg_fdct_10x10		jFD10x10
+#define jpeg_fdct_11x11		jFD11x11
+#define jpeg_fdct_12x12		jFD12x12
+#define jpeg_fdct_13x13		jFD13x13
+#define jpeg_fdct_14x14		jFD14x14
+#define jpeg_fdct_15x15		jFD15x15
+#define jpeg_fdct_16x16		jFD16x16
+#define jpeg_fdct_16x8		jFD16x8
+#define jpeg_fdct_14x7		jFD14x7
+#define jpeg_fdct_12x6		jFD12x6
+#define jpeg_fdct_10x5		jFD10x5
+#define jpeg_fdct_8x4		jFD8x4
+#define jpeg_fdct_6x3		jFD6x3
+#define jpeg_fdct_4x2		jFD4x2
+#define jpeg_fdct_2x1		jFD2x1
+#define jpeg_fdct_8x16		jFD8x16
+#define jpeg_fdct_7x14		jFD7x14
+#define jpeg_fdct_6x12		jFD6x12
+#define jpeg_fdct_5x10		jFD5x10
+#define jpeg_fdct_4x8		jFD4x8
+#define jpeg_fdct_3x6		jFD3x6
+#define jpeg_fdct_2x4		jFD2x4
+#define jpeg_fdct_1x2		jFD1x2
 #define jpeg_idct_islow		jRDislow
 #define jpeg_idct_ifast		jRDifast
 #define jpeg_idct_float		jRDfloat
+#define jpeg_idct_7x7		jRD7x7
+#define jpeg_idct_6x6		jRD6x6
+#define jpeg_idct_5x5		jRD5x5
 #define jpeg_idct_4x4		jRD4x4
+#define jpeg_idct_3x3		jRD3x3
 #define jpeg_idct_2x2		jRD2x2
 #define jpeg_idct_1x1		jRD1x1
+#define jpeg_idct_9x9		jRD9x9
+#define jpeg_idct_10x10		jRD10x10
+#define jpeg_idct_11x11		jRD11x11
+#define jpeg_idct_12x12		jRD12x12
+#define jpeg_idct_13x13		jRD13x13
+#define jpeg_idct_14x14		jRD14x14
+#define jpeg_idct_15x15		jRD15x15
+#define jpeg_idct_16x16		jRD16x16
+#define jpeg_idct_16x8		jRD16x8
+#define jpeg_idct_14x7		jRD14x7
+#define jpeg_idct_12x6		jRD12x6
+#define jpeg_idct_10x5		jRD10x5
+#define jpeg_idct_8x4		jRD8x4
+#define jpeg_idct_6x3		jRD6x3
+#define jpeg_idct_4x2		jRD4x2
+#define jpeg_idct_2x1		jRD2x1
+#define jpeg_idct_8x16		jRD8x16
+#define jpeg_idct_7x14		jRD7x14
+#define jpeg_idct_6x12		jRD6x12
+#define jpeg_idct_5x10		jRD5x10
+#define jpeg_idct_4x8		jRD4x8
+#define jpeg_idct_3x6		jRD3x8
+#define jpeg_idct_2x4		jRD2x4
+#define jpeg_idct_1x2		jRD1x2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */

 /* Extern declarations for the forward and inverse DCT routines. */

-EXTERN(void) jpeg_fdct_islow JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_ifast JPP((DCTELEM * data));
-EXTERN(void) jpeg_fdct_float JPP((FAST_FLOAT * data));
+EXTERN(void) jpeg_fdct_islow
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_ifast
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_float
+    JPP((FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_9x9
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_11x11
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_13x13
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_15x15
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_16x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_14x7
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_12x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_10x5
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x3
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x1
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_8x16
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_7x14
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_6x12
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_5x10
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_4x8
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_3x6
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_2x4
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));
+EXTERN(void) jpeg_fdct_1x2
+    JPP((DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col));

 EXTERN(void) jpeg_idct_islow
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
@@ -107,15 +240,99 @@ EXTERN(void) jpeg_idct_ifast
 EXTERN(void) jpeg_idct_float
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_4x4
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_2x2
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
 EXTERN(void) jpeg_idct_1x1
    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_9x9
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_11x11
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_13x13
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_15x15
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_16x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_14x7
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_12x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_10x5
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x3
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x1
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_8x16
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_7x14
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_6x12
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_5x10
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_4x8
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_3x6
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_2x4
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));
+EXTERN(void) jpeg_idct_1x2
+    JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr,
+         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col));


 /*
--- a/3rdparty/libjpeg/jddctmgr.c
+++ b/3rdparty/libjpeg/jddctmgr.c
@@ -2,6 +2,7 @@
 * jddctmgr.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2002-2010 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -98,22 +99,134 @@ start_pass (j_decompress_ptr cinfo)
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
    /* Select the proper IDCT routine for this component's scaling */
-    switch (compptr->DCT_scaled_size) {
+    switch ((compptr->DCT_h_scaled_size << 8) + compptr->DCT_v_scaled_size) {
 #ifdef IDCT_SCALING_SUPPORTED
-    case 1:
+    case ((1 << 8) + 1):
      method_ptr = jpeg_idct_1x1;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
      break;
-    case 2:
+    case ((2 << 8) + 2):
      method_ptr = jpeg_idct_2x2;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
      break;
-    case 4:
+    case ((3 << 8) + 3):
+      method_ptr = jpeg_idct_3x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 4):
      method_ptr = jpeg_idct_4x4;
-      method = JDCT_ISLOW;	/* jidctred uses islow-style table */
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 5):
+      method_ptr = jpeg_idct_5x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 6):
+      method_ptr = jpeg_idct_6x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 7):
+      method_ptr = jpeg_idct_7x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((9 << 8) + 9):
+      method_ptr = jpeg_idct_9x9;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 10):
+      method_ptr = jpeg_idct_10x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((11 << 8) + 11):
+      method_ptr = jpeg_idct_11x11;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 12):
+      method_ptr = jpeg_idct_12x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((13 << 8) + 13):
+      method_ptr = jpeg_idct_13x13;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 14):
+      method_ptr = jpeg_idct_14x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((15 << 8) + 15):
+      method_ptr = jpeg_idct_15x15;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 16):
+      method_ptr = jpeg_idct_16x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((16 << 8) + 8):
+      method_ptr = jpeg_idct_16x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((14 << 8) + 7):
+      method_ptr = jpeg_idct_14x7;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((12 << 8) + 6):
+      method_ptr = jpeg_idct_12x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((10 << 8) + 5):
+      method_ptr = jpeg_idct_10x5;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 4):
+      method_ptr = jpeg_idct_8x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 3):
+      method_ptr = jpeg_idct_6x3;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 2):
+      method_ptr = jpeg_idct_4x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 1):
+      method_ptr = jpeg_idct_2x1;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((8 << 8) + 16):
+      method_ptr = jpeg_idct_8x16;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((7 << 8) + 14):
+      method_ptr = jpeg_idct_7x14;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((6 << 8) + 12):
+      method_ptr = jpeg_idct_6x12;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((5 << 8) + 10):
+      method_ptr = jpeg_idct_5x10;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((4 << 8) + 8):
+      method_ptr = jpeg_idct_4x8;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((3 << 8) + 6):
+      method_ptr = jpeg_idct_3x6;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((2 << 8) + 4):
+      method_ptr = jpeg_idct_2x4;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
+      break;
+    case ((1 << 8) + 2):
+      method_ptr = jpeg_idct_1x2;
+      method = JDCT_ISLOW;	/* jidctint uses islow-style table */
      break;
 #endif
-    case DCTSIZE:
+    case ((DCTSIZE << 8) + DCTSIZE):
      switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
      case JDCT_ISLOW:
@@ -139,7 +252,8 @@ start_pass (j_decompress_ptr cinfo)
      }
      break;
    default:
-      ERREXIT1(cinfo, JERR_BAD_DCTSIZE, compptr->DCT_scaled_size);
+      ERREXIT2(cinfo, JERR_BAD_DCTSIZE,
+               compptr->DCT_h_scaled_size, compptr->DCT_v_scaled_size);
      break;
    }
    idct->pub.inverse_DCT[ci] = method_ptr;
@@ -211,6 +325,7 @@ start_pass (j_decompress_ptr cinfo)
         * coefficients scaled by scalefactor[row]*scalefactor[col], where
         *   scalefactor[0] = 1
         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
+         * We apply a further scale factor of 1/8.
         */
        FLOAT_MULT_TYPE * fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
        int row, col;
@@ -224,7 +339,7 @@ start_pass (j_decompress_ptr cinfo)
          for (col = 0; col < DCTSIZE; col++) {
            fmtbl[i] = (FLOAT_MULT_TYPE)
              ((double) qtbl->quantval[i] *
-           aanscalefactor[row] * aanscalefactor[col]);
+               aanscalefactor[row] * aanscalefactor[col] * 0.125);
            i++;
          }
        }
--- a/3rdparty/libjpeg/jdhuff.c
+++ b/3rdparty/libjpeg/jdhuff.c
--- a/3rdparty/libjpeg/jdhuff.h
+++ b/3rdparty/libjpeg/jdhuff.h
@@ -1,201 +0,0 @@
-/*
- * jdhuff.h
- *
- * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for Huffman entropy decoding routines
- * that are shared between the sequential decoder (jdhuff.c) and the
- * progressive decoder (jdphuff.c).  No other modules need to see these.
- */
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jpeg_make_d_derived_tbl	jMkDDerived
-#define jpeg_fill_bit_buffer	jFilBitBuf
-#define jpeg_huff_decode	jHufDecode
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
-/* Derived data constructed for each Huffman table */
-
-#define HUFF_LOOKAHEAD	8	/* # of bits of lookahead */
-
-typedef struct {
-  /* Basic tables: (element [0] of each array is unused) */
-  INT32 maxcode[18];		/* largest code of length k (-1 if none) */
-  /* (maxcode[17] is a sentinel to ensure jpeg_huff_decode terminates) */
-  INT32 valoffset[17];		/* huffval[] offset for codes of length k */
-  /* valoffset[k] = huffval[] index of 1st symbol of code length k, less
-   * the smallest code of length k; so given a code of length k, the
-   * corresponding symbol is huffval[code + valoffset[k]]
-   */
-
-  /* Link to public Huffman table (needed only in jpeg_huff_decode) */
-  JHUFF_TBL *pub;
-
-  /* Lookahead tables: indexed by the next HUFF_LOOKAHEAD bits of
-   * the input data stream.  If the next Huffman code is no more
-   * than HUFF_LOOKAHEAD bits long, we can obtain its length and
-   * the corresponding symbol directly from these tables.
-   */
-  int look_nbits[1<<HUFF_LOOKAHEAD]; /* # bits, or 0 if too long */
-  UINT8 look_sym[1<<HUFF_LOOKAHEAD]; /* symbol, or unused */
-} d_derived_tbl;
-
-/* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
-    JPP((j_decompress_ptr cinfo, boolean isDC, int tblno,
-         d_derived_tbl ** pdtbl));
-
-
-/*
- * Fetching the next N bits from the input stream is a time-critical operation
- * for the Huffman decoders.  We implement it with a combination of inline
- * macros and out-of-line subroutines.  Note that N (the number of bits
- * demanded at one time) never exceeds 15 for JPEG use.
- *
- * We read source bytes into get_buffer and dole out bits as needed.
- * If get_buffer already contains enough bits, they are fetched in-line
- * by the macros CHECK_BIT_BUFFER and GET_BITS.  When there aren't enough
- * bits, jpeg_fill_bit_buffer is called; it will attempt to fill get_buffer
- * as full as possible (not just to the number of bits needed; this
- * prefetching reduces the overhead cost of calling jpeg_fill_bit_buffer).
- * Note that jpeg_fill_bit_buffer may return FALSE to indicate suspension.
- * On TRUE return, jpeg_fill_bit_buffer guarantees that get_buffer contains
- * at least the requested number of bits --- dummy zeroes are inserted if
- * necessary.
- */
-
-typedef INT32 bit_buf_type;	/* type of bit-extraction buffer */
-#define BIT_BUF_SIZE  32	/* size of buffer in bits */
-
-/* If long is > 32 bits on your machine, and shifting/masking longs is
- * reasonably fast, making bit_buf_type be long and setting BIT_BUF_SIZE
- * appropriately should be a win.  Unfortunately we can't define the size
- * with something like  #define BIT_BUF_SIZE (sizeof(bit_buf_type)*8)
- * because not all machines measure sizeof in 8-bit bytes.
- */
-
-typedef struct {		/* Bitreading state saved across MCUs */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-} bitread_perm_state;
-
-typedef struct {		/* Bitreading working state within an MCU */
-  /* Current data source location */
-  /* We need a copy, rather than munging the original, in case of suspension */
-  const JOCTET * next_input_byte; /* => next byte to read from source */
-  size_t bytes_in_buffer;	/* # of bytes remaining in source buffer */
-  /* Bit input buffer --- note these values are kept in register variables,
-   * not in this struct, inside the inner loops.
-   */
-  bit_buf_type get_buffer;	/* current bit-extraction buffer */
-  int bits_left;		/* # of unused bits in it */
-  /* Pointer needed by jpeg_fill_bit_buffer. */
-  j_decompress_ptr cinfo;	/* back link to decompress master record */
-} bitread_working_state;
-
-/* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS  \
-    register bit_buf_type get_buffer;  \
-    register int bits_left;  \
-    bitread_working_state br_state
-
-#define BITREAD_LOAD_STATE(cinfop,permstate)  \
-    br_state.cinfo = cinfop; \
-    br_state.next_input_byte = cinfop->src->next_input_byte; \
-    br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-    get_buffer = permstate.get_buffer; \
-    bits_left = permstate.bits_left;
-
-#define BITREAD_SAVE_STATE(cinfop,permstate)  \
-    cinfop->src->next_input_byte = br_state.next_input_byte; \
-    cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-    permstate.get_buffer = get_buffer; \
-    permstate.bits_left = bits_left
-
-/*
- * These macros provide the in-line portion of bit fetching.
- * Use CHECK_BIT_BUFFER to ensure there are N bits in get_buffer
- * before using GET_BITS, PEEK_BITS, or DROP_BITS.
- * The variables get_buffer and bits_left are assumed to be locals,
- * but the state struct might not be (jpeg_huff_decode needs this).
- *	CHECK_BIT_BUFFER(state,n,action);
- *		Ensure there are N bits in get_buffer; if suspend, take action.
- *      val = GET_BITS(n);
- *		Fetch next N bits.
- *      val = PEEK_BITS(n);
- *		Fetch next N bits without removing them from the buffer.
- *	DROP_BITS(n);
- *		Discard next N bits.
- * The value N should be a simple variable, not an expression, because it
- * is evaluated multiple times.
- */
-
-#define CHECK_BIT_BUFFER(state,nbits,action) \
-    { if (bits_left < (nbits)) {  \
-        if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-          { action; }  \
-        get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
-
-#define GET_BITS(nbits) \
-    (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
-
-#define PEEK_BITS(nbits) \
-    (((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
-
-#define DROP_BITS(nbits) \
-    (bits_left -= (nbits))
-
-/* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
-    JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-         register int bits_left, int nbits));
-
-
-/*
- * Code for extracting next Huffman-coded symbol from input bit stream.
- * Again, this is time-critical and we make the main paths be macros.
- *
- * We use a lookahead table to process codes of up to HUFF_LOOKAHEAD bits
- * without looping.  Usually, more than 95% of the Huffman codes will be 8
- * or fewer bits long.  The few overlength codes are handled with a loop,
- * which need not be inline code.
- *
- * Notes about the HUFF_DECODE macro:
- * 1. Near the end of the data segment, we may fail to get enough bits
- *    for a lookahead.  In that case, we do it the hard way.
- * 2. If the lookahead table contains no entry, the next code must be
- *    more than HUFF_LOOKAHEAD bits long.
- * 3. jpeg_huff_decode returns -1 if forced to suspend.
- */
-
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
-  if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-    if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
-    } \
-  } \
-  look = PEEK_BITS(HUFF_LOOKAHEAD); \
-  if ((nb = htbl->look_nbits[look]) != 0) { \
-    DROP_BITS(nb); \
-    result = htbl->look_sym[look]; \
-  } else { \
-    nb = HUFF_LOOKAHEAD+1; \
-slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-    { failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
-  } \
-}
-
-/* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
-    JPP((bitread_working_state * state, register bit_buf_type get_buffer,
-         register int bits_left, d_derived_tbl * htbl, int min_bits));
--- a/3rdparty/libjpeg/jdinput.c
+++ b/3rdparty/libjpeg/jdinput.c
@@ -2,13 +2,14 @@
 * jdinput.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains input control logic for the JPEG decompressor.
 * These routines are concerned with controlling the decompressor's input
 * processing (marker reading and coefficient decoding).  The actual input
- * reading is done in jdmarker.c, jdhuff.c, and jdphuff.c.
+ * reading is done in jdmarker.c, jdhuff.c, and jdarith.c.
 */

 #define JPEG_INTERNALS
@@ -21,7 +22,7 @@
 typedef struct {
  struct jpeg_input_controller pub; /* public fields */

-  boolean inheaders;		/* TRUE until first SOS is reached */
+  int inheaders;		/* Nonzero until first SOS is reached */
 } my_input_controller;

 typedef my_input_controller * my_inputctl_ptr;
@@ -35,6 +36,174 @@ METHODDEF(int) consume_markers JPP((j_decompress_ptr cinfo));
 * Routines to calculate various quantities related to the size of the image.
 */

+
+/*
+ * Compute output image dimensions and related values.
+ * NOTE: this is exported for possible use by application.
+ * Hence it mustn't do anything that can't be done twice.
+ */
+
+GLOBAL(void)
+jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+/* Do computations that are needed before master selection phase.
+ * This function is used for transcoding and full decompression.
+ */
+{
+#ifdef IDCT_SCALING_SUPPORTED
+  int ci;
+  jpeg_component_info *compptr;
+
+  /* Compute actual output image dimensions and DCT scaling choices. */
+  if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom) {
+    /* Provide 1/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 1;
+    cinfo->min_DCT_v_scaled_size = 1;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 2) {
+    /* Provide 2/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 2L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 2L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 2;
+    cinfo->min_DCT_v_scaled_size = 2;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 3) {
+    /* Provide 3/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 3L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 3L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 3;
+    cinfo->min_DCT_v_scaled_size = 3;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 4) {
+    /* Provide 4/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 4L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 4L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 4;
+    cinfo->min_DCT_v_scaled_size = 4;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 5) {
+    /* Provide 5/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 5L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 5L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 5;
+    cinfo->min_DCT_v_scaled_size = 5;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 6) {
+    /* Provide 6/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 6L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 6L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 6;
+    cinfo->min_DCT_v_scaled_size = 6;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 7) {
+    /* Provide 7/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 7L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 7L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 7;
+    cinfo->min_DCT_v_scaled_size = 7;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 8) {
+    /* Provide 8/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 8L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 8L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 8;
+    cinfo->min_DCT_v_scaled_size = 8;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 9) {
+    /* Provide 9/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 9L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 9L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 9;
+    cinfo->min_DCT_v_scaled_size = 9;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 10) {
+    /* Provide 10/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 10L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 10L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 10;
+    cinfo->min_DCT_v_scaled_size = 10;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 11) {
+    /* Provide 11/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 11L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 11L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 11;
+    cinfo->min_DCT_v_scaled_size = 11;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 12) {
+    /* Provide 12/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 12L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 12L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 12;
+    cinfo->min_DCT_v_scaled_size = 12;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 13) {
+    /* Provide 13/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 13L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 13L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 13;
+    cinfo->min_DCT_v_scaled_size = 13;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 14) {
+    /* Provide 14/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 14L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 14L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 14;
+    cinfo->min_DCT_v_scaled_size = 14;
+  } else if (cinfo->scale_num * cinfo->block_size <= cinfo->scale_denom * 15) {
+    /* Provide 15/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 15L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 15L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 15;
+    cinfo->min_DCT_v_scaled_size = 15;
+  } else {
+    /* Provide 16/block_size scaling */
+    cinfo->output_width = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_width * 16L, (long) cinfo->block_size);
+    cinfo->output_height = (JDIMENSION)
+      jdiv_round_up((long) cinfo->image_height * 16L, (long) cinfo->block_size);
+    cinfo->min_DCT_h_scaled_size = 16;
+    cinfo->min_DCT_v_scaled_size = 16;
+  }
+
+  /* Recompute dimensions of components */
+  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+       ci++, compptr++) {
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size;
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size;
+  }
+
+#else /* !IDCT_SCALING_SUPPORTED */
+
+  /* Hardwire it to "no scaling" */
+  cinfo->output_width = cinfo->image_width;
+  cinfo->output_height = cinfo->image_height;
+  /* jdinput.c has already initialized DCT_scaled_size,
+   * and has computed unscaled downsampled_width and downsampled_height.
+   */
+
+#endif /* IDCT_SCALING_SUPPORTED */
+}
+
+
 LOCAL(void)
 initial_setup (j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
@@ -70,23 +239,121 @@ initial_setup (j_decompress_ptr cinfo)
                                   compptr->v_samp_factor);
  }

-  /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
-   * In the full decompressor, this will be overridden by jdmaster.c;
-   * but in the transcoder, jdmaster.c is not used, so we must do it here.
+  /* Derive block_size, natural_order, and lim_Se */
+  if (cinfo->is_baseline || (cinfo->progressive_mode &&
+      cinfo->comps_in_scan)) { /* no pseudo SOS marker */
+    cinfo->block_size = DCTSIZE;
+    cinfo->natural_order = jpeg_natural_order;
+    cinfo->lim_Se = DCTSIZE2-1;
+  } else
+    switch (cinfo->Se) {
+    case (1*1-1):
+      cinfo->block_size = 1;
+      cinfo->natural_order = jpeg_natural_order; /* not needed */
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (2*2-1):
+      cinfo->block_size = 2;
+      cinfo->natural_order = jpeg_natural_order2;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (3*3-1):
+      cinfo->block_size = 3;
+      cinfo->natural_order = jpeg_natural_order3;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (4*4-1):
+      cinfo->block_size = 4;
+      cinfo->natural_order = jpeg_natural_order4;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (5*5-1):
+      cinfo->block_size = 5;
+      cinfo->natural_order = jpeg_natural_order5;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (6*6-1):
+      cinfo->block_size = 6;
+      cinfo->natural_order = jpeg_natural_order6;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (7*7-1):
+      cinfo->block_size = 7;
+      cinfo->natural_order = jpeg_natural_order7;
+      cinfo->lim_Se = cinfo->Se;
+      break;
+    case (8*8-1):
+      cinfo->block_size = 8;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (9*9-1):
+      cinfo->block_size = 9;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (10*10-1):
+      cinfo->block_size = 10;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (11*11-1):
+      cinfo->block_size = 11;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (12*12-1):
+      cinfo->block_size = 12;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (13*13-1):
+      cinfo->block_size = 13;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (14*14-1):
+      cinfo->block_size = 14;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (15*15-1):
+      cinfo->block_size = 15;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    case (16*16-1):
+      cinfo->block_size = 16;
+      cinfo->natural_order = jpeg_natural_order;
+      cinfo->lim_Se = DCTSIZE2-1;
+      break;
+    default:
+      ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
+               cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
+      break;
+    }
+
+  /* We initialize DCT_scaled_size and min_DCT_scaled_size to block_size.
+   * In the full decompressor,
+   * this will be overridden by jpeg_calc_output_dimensions in jdmaster.c;
+   * but in the transcoder,
+   * jpeg_calc_output_dimensions is not used, so we must do it here.
   */
-  cinfo->min_DCT_scaled_size = DCTSIZE;
+  cinfo->min_DCT_h_scaled_size = cinfo->block_size;
+  cinfo->min_DCT_v_scaled_size = cinfo->block_size;

  /* Compute dimensions of components */
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    compptr->DCT_scaled_size = DCTSIZE;
+    compptr->DCT_h_scaled_size = cinfo->block_size;
+    compptr->DCT_v_scaled_size = cinfo->block_size;
    /* Size in DCT blocks */
    compptr->width_in_blocks = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-            (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    compptr->height_in_blocks = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-            (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
    /* downsampled_width and downsampled_height will also be overridden by
     * jdmaster.c if we are doing full decompression.  The transcoder library
     * doesn't use these values, but the calling application might.
@@ -107,7 +374,7 @@ initial_setup (j_decompress_ptr cinfo)
  /* Compute number of fully interleaved MCU rows. */
  cinfo->total_iMCU_rows = (JDIMENSION)
    jdiv_round_up((long) cinfo->image_height,
-          (long) (cinfo->max_v_samp_factor*DCTSIZE));
+                  (long) (cinfo->max_v_samp_factor * cinfo->block_size));

  /* Decide whether file contains multiple scans */
  if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -138,7 +405,7 @@ per_scan_setup (j_decompress_ptr cinfo)
    compptr->MCU_width = 1;
    compptr->MCU_height = 1;
    compptr->MCU_blocks = 1;
-    compptr->MCU_sample_width = compptr->DCT_scaled_size;
+    compptr->MCU_sample_width = compptr->DCT_h_scaled_size;
    compptr->last_col_width = 1;
    /* For noninterleaved scans, it is convenient to define last_row_height
     * as the number of block rows present in the last iMCU row.
@@ -161,10 +428,10 @@ per_scan_setup (j_decompress_ptr cinfo)
    /* Overall image size in MCUs */
    cinfo->MCUs_per_row = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width,
-            (long) (cinfo->max_h_samp_factor*DCTSIZE));
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    cinfo->MCU_rows_in_scan = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height,
-            (long) (cinfo->max_v_samp_factor*DCTSIZE));
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));

    cinfo->blocks_in_MCU = 0;

@@ -174,7 +441,7 @@ per_scan_setup (j_decompress_ptr cinfo)
      compptr->MCU_width = compptr->h_samp_factor;
      compptr->MCU_height = compptr->v_samp_factor;
      compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width * compptr->DCT_h_scaled_size;
      /* Figure number of non-dummy blocks in last MCU column & row */
      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
      if (tmp == 0) tmp = compptr->MCU_width;
@@ -282,6 +549,10 @@ finish_input_pass (j_decompress_ptr cinfo)
 * The consume_input method pointer points either here or to the
 * coefficient controller's consume_data routine, depending on whether
 * we are reading a compressed data segment or inter-segment markers.
+ *
+ * Note: This function should NOT return a pseudo SOS marker (with zero
+ * component number) to the caller.  A pseudo marker received by
+ * read_markers is processed and then skipped for other markers.
 */

 METHODDEF(int)
@@ -293,13 +564,19 @@ consume_markers (j_decompress_ptr cinfo)
  if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
    return JPEG_REACHED_EOI;

+  for (;;) {			/* Loop to pass pseudo SOS marker */
    val = (*cinfo->marker->read_markers) (cinfo);

    switch (val) {
    case JPEG_REACHED_SOS:	/* Found SOS */
      if (inputctl->inheaders) { /* 1st SOS */
+        if (inputctl->inheaders == 1)
          initial_setup(cinfo);
-      inputctl->inheaders = FALSE;
+        if (cinfo->comps_in_scan == 0) { /* pseudo SOS marker */
+          inputctl->inheaders = 2;
+          break;
+        }
+        inputctl->inheaders = 0;
        /* Note: start_input_pass must be called by jdmaster.c
         * before any more input can be consumed.  jdapimin.c is
         * responsible for enforcing this sequencing.
@@ -307,9 +584,11 @@ consume_markers (j_decompress_ptr cinfo)
      } else {			/* 2nd or later SOS marker */
        if (! inputctl->pub.has_multiple_scans)
          ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
+        if (cinfo->comps_in_scan == 0) /* unexpected pseudo SOS marker */
+          break;
        start_input_pass(cinfo);
      }
-    break;
+      return val;
    case JPEG_REACHED_EOI:	/* Found EOI */
      inputctl->pub.eoi_reached = TRUE;
      if (inputctl->inheaders) { /* Tables-only datastream, apparently */
@@ -322,12 +601,13 @@ consume_markers (j_decompress_ptr cinfo)
        if (cinfo->output_scan_number > cinfo->input_scan_number)
          cinfo->output_scan_number = cinfo->input_scan_number;
      }
-    break;
-  case JPEG_SUSPENDED:
-    break;
-  }
-
      return val;
+    case JPEG_SUSPENDED:
+      return val;
+    default:
+      return val;
+    }
+  }
 }


@@ -343,7 +623,7 @@ reset_input_controller (j_decompress_ptr cinfo)
  inputctl->pub.consume_input = consume_markers;
  inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
  inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
  /* Reset other modules */
  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
  (*cinfo->marker->reset_marker_reader) (cinfo);
@@ -377,5 +657,5 @@ jinit_input_controller (j_decompress_ptr cinfo)
   */
  inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
  inputctl->pub.eoi_reached = FALSE;
-  inputctl->inheaders = TRUE;
+  inputctl->inheaders = 1;
 }
--- a/3rdparty/libjpeg/jdmainct.c
+++ b/3rdparty/libjpeg/jdmainct.c
@@ -2,15 +2,16 @@
 * jdmainct.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2002-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
- * This file contains the main_ptr buffer controller for decompression.
- * The main_ptr buffer lies between the JPEG decompressor proper and the
+ * This file contains the main buffer controller for decompression.
+ * The main buffer lies between the JPEG decompressor proper and the
 * post-processor; it holds downsampled data in the JPEG colorspace.
 *
 * Note that this code is bypassed in raw-data mode, since the application
- * supplies the equivalent of the main_ptr buffer in that case.
+ * supplies the equivalent of the main buffer in that case.
 */

 #define JPEG_INTERNALS
@@ -19,9 +20,9 @@


 /*
- * In the current system design, the main_ptr buffer need never be a full-image
+ * In the current system design, the main buffer need never be a full-image
 * buffer; any full-height buffers will be found inside the coefficient or
- * postprocessing controllers.  Nonetheless, the main_ptr controller is not
+ * postprocessing controllers.  Nonetheless, the main controller is not
 * trivial.  Its responsibility is to provide context rows for upsampling/
 * rescaling, and doing this in an efficient fashion is a bit tricky.
 *
@@ -159,24 +160,24 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 * This is done only once, not once per pass.
 */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  int ci, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY xbuf;

  /* Get top-level space for component array pointers.
   * We alloc both arrays with one call to save a few cycles.
   */
-  main_ptr->xbuffer[0] = (JSAMPIMAGE)
+  mainp->xbuffer[0] = (JSAMPIMAGE)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                cinfo->num_components * 2 * SIZEOF(JSAMPARRAY));
-  main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
+  mainp->xbuffer[1] = mainp->xbuffer[0] + cinfo->num_components;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
    /* Get space for pointer lists --- M+4 row groups in each list.
     * We alloc both pointer lists with one call to save a few cycles.
     */
@@ -184,9 +185,9 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                  2 * (rgroup * (M + 4)) * SIZEOF(JSAMPROW));
    xbuf += rgroup;		/* want one row group at negative offsets */
-    main_ptr->xbuffer[0][ci] = xbuf;
+    mainp->xbuffer[0][ci] = xbuf;
    xbuf += rgroup * (M + 4);
-    main_ptr->xbuffer[1][ci] = xbuf;
+    mainp->xbuffer[1][ci] = xbuf;
  }
 }

@@ -194,26 +195,26 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 LOCAL(void)
 make_funny_pointers (j_decompress_ptr cinfo)
 /* Create the funny pointer lists discussed in the comments above.
- * The actual workspace is already allocated (in main_ptr->buffer),
+ * The actual workspace is already allocated (in main->buffer),
 * and the space for the pointer lists is allocated too.
 * This routine just fills in the curiously ordered lists.
 * This will be repeated at the beginning of each pass.
 */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY buf, xbuf0, xbuf1;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = main_ptr->xbuffer[0][ci];
-    xbuf1 = main_ptr->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = mainp->xbuffer[0][ci];
+    xbuf1 = mainp->xbuffer[1][ci];
    /* First copy the workspace pointers as-is */
-    buf = main_ptr->buffer[ci];
+    buf = mainp->buffer[ci];
    for (i = 0; i < rgroup * (M + 2); i++) {
      xbuf0[i] = xbuf1[i] = buf[i];
    }
@@ -240,18 +241,18 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
 * This changes the pointer list state from top-of-image to the normal state.
 */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  int ci, i, rgroup;
-  int M = cinfo->min_DCT_scaled_size;
+  int M = cinfo->min_DCT_v_scaled_size;
  jpeg_component_info *compptr;
  JSAMPARRAY xbuf0, xbuf1;

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    xbuf0 = main_ptr->xbuffer[0][ci];
-    xbuf1 = main_ptr->xbuffer[1][ci];
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    xbuf0 = mainp->xbuffer[0][ci];
+    xbuf1 = mainp->xbuffer[1][ci];
    for (i = 0; i < rgroup; i++) {
      xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
      xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
@@ -269,7 +270,7 @@ set_bottom_pointers (j_decompress_ptr cinfo)
 * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
 */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  int ci, i, rgroup, iMCUheight, rows_left;
  jpeg_component_info *compptr;
  JSAMPARRAY xbuf;
@@ -277,8 +278,8 @@ set_bottom_pointers (j_decompress_ptr cinfo)
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
    /* Count sample rows in one iMCU row and in one row group */
-    iMCUheight = compptr->v_samp_factor * compptr->DCT_scaled_size;
-    rgroup = iMCUheight / cinfo->min_DCT_scaled_size;
+    iMCUheight = compptr->v_samp_factor * compptr->DCT_v_scaled_size;
+    rgroup = iMCUheight / cinfo->min_DCT_v_scaled_size;
    /* Count nondummy sample rows remaining for this component */
    rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
    if (rows_left == 0) rows_left = iMCUheight;
@@ -286,12 +287,12 @@ set_bottom_pointers (j_decompress_ptr cinfo)
     * so we need only do it once.
     */
    if (ci == 0) {
-      main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+      mainp->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
    }
    /* Duplicate the last real sample row rgroup*2 times; this pads out the
     * last partial rowgroup and ensures at least one full rowgroup of context.
     */
-    xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
+    xbuf = mainp->xbuffer[mainp->whichptr][ci];
    for (i = 0; i < rgroup * 2; i++) {
      xbuf[rows_left + i] = xbuf[rows_left-1];
    }
@@ -306,27 +307,27 @@ set_bottom_pointers (j_decompress_ptr cinfo)
 METHODDEF(void)
 start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;

  switch (pass_mode) {
  case JBUF_PASS_THRU:
    if (cinfo->upsample->need_context_rows) {
-      main_ptr->pub.process_data = process_data_context_main;
+      mainp->pub.process_data = process_data_context_main;
      make_funny_pointers(cinfo); /* Create the xbuffer[] lists */
-      main_ptr->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
-      main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
-      main_ptr->iMCU_row_ctr = 0;
+      mainp->whichptr = 0;	/* Read first iMCU row into xbuffer[0] */
+      mainp->context_state = CTX_PREPARE_FOR_IMCU;
+      mainp->iMCU_row_ctr = 0;
    } else {
      /* Simple case with no context needed */
-      main_ptr->pub.process_data = process_data_simple_main;
+      mainp->pub.process_data = process_data_simple_main;
    }
-    main_ptr->buffer_full = FALSE;	/* Mark buffer empty */
-    main_ptr->rowgroup_ctr = 0;
+    mainp->buffer_full = FALSE;	/* Mark buffer empty */
+    mainp->rowgroup_ctr = 0;
    break;
 #ifdef QUANT_2PASS_SUPPORTED
  case JBUF_CRANK_DEST:
    /* For last pass of 2-pass quantization, just crank the postprocessor */
-    main_ptr->pub.process_data = process_data_crank_post;
+    mainp->pub.process_data = process_data_crank_post;
    break;
 #endif
  default:
@@ -346,32 +347,32 @@ process_data_simple_main (j_decompress_ptr cinfo,
                          JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
                          JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;
  JDIMENSION rowgroups_avail;

-  /* Read input data if we haven't filled the main_ptr buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+  /* Read input data if we haven't filled the main buffer yet */
+  if (! mainp->buffer_full) {
+    if (! (*cinfo->coef->decompress_data) (cinfo, mainp->buffer))
      return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    mainp->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
  }

  /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION) cinfo->min_DCT_v_scaled_size;
  /* Note: at the bottom of the image, we may pass extra garbage row groups
   * to the postprocessor.  The postprocessor has to check for bottom
   * of image anyway (at row resolution), so no point in us doing it too.
   */

  /* Feed the postprocessor */
-  (*cinfo->post->post_process_data) (cinfo, main_ptr->buffer,
-                     &main_ptr->rowgroup_ctr, rowgroups_avail,
+  (*cinfo->post->post_process_data) (cinfo, mainp->buffer,
+                                     &mainp->rowgroup_ctr, rowgroups_avail,
                                     output_buf, out_row_ctr, out_rows_avail);

  /* Has postprocessor consumed all the data yet? If so, mark buffer empty */
-  if (main_ptr->rowgroup_ctr >= rowgroups_avail) {
-    main_ptr->buffer_full = FALSE;
-    main_ptr->rowgroup_ctr = 0;
+  if (mainp->rowgroup_ctr >= rowgroups_avail) {
+    mainp->buffer_full = FALSE;
+    mainp->rowgroup_ctr = 0;
  }
 }

@@ -386,15 +387,15 @@ process_data_context_main (j_decompress_ptr cinfo,
                           JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
                           JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr mainp = (my_main_ptr) cinfo->main;

-  /* Read input data if we haven't filled the main_ptr buffer yet */
-  if (! main_ptr->buffer_full) {
+  /* Read input data if we haven't filled the main buffer yet */
+  if (! mainp->buffer_full) {
    if (! (*cinfo->coef->decompress_data) (cinfo,
-                       main_ptr->xbuffer[main_ptr->whichptr]))
+                                           mainp->xbuffer[mainp->whichptr]))
      return;			/* suspension forced, can do nothing more */
-    main_ptr->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
-    main_ptr->iMCU_row_ctr++;	/* count rows received */
+    mainp->buffer_full = TRUE;	/* OK, we have an iMCU row to work with */
+    mainp->iMCU_row_ctr++;	/* count rows received */
  }

  /* Postprocessor typically will not swallow all the input data it is handed
@@ -402,47 +403,47 @@ process_data_context_main (j_decompress_ptr cinfo,
   * to exit and restart.  This switch lets us keep track of how far we got.
   * Note that each case falls through to the next on successful completion.
   */
-  switch (main_ptr->context_state) {
+  switch (mainp->context_state) {
  case CTX_POSTPONED_ROW:
    /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-            &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, mainp->xbuffer[mainp->whichptr],
+                        &mainp->rowgroup_ctr, mainp->rowgroups_avail,
                        output_buf, out_row_ctr, out_rows_avail);
-    if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+    if (mainp->rowgroup_ctr < mainp->rowgroups_avail)
      return;			/* Need to suspend */
-    main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
+    mainp->context_state = CTX_PREPARE_FOR_IMCU;
    if (*out_row_ctr >= out_rows_avail)
      return;			/* Postprocessor exactly filled output buf */
    /*FALLTHROUGH*/
  case CTX_PREPARE_FOR_IMCU:
    /* Prepare to process first M-1 row groups of this iMCU row */
-    main_ptr->rowgroup_ctr = 0;
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size - 1);
+    mainp->rowgroup_ctr = 0;
+    mainp->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size - 1);
    /* Check for bottom of image: if so, tweak pointers to "duplicate"
     * the last sample row, and adjust rowgroups_avail to ignore padding rows.
     */
-    if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
+    if (mainp->iMCU_row_ctr == cinfo->total_iMCU_rows)
      set_bottom_pointers(cinfo);
-    main_ptr->context_state = CTX_PROCESS_IMCU;
+    mainp->context_state = CTX_PROCESS_IMCU;
    /*FALLTHROUGH*/
  case CTX_PROCESS_IMCU:
    /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-            &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
+    (*cinfo->post->post_process_data) (cinfo, mainp->xbuffer[mainp->whichptr],
+                        &mainp->rowgroup_ctr, mainp->rowgroups_avail,
                        output_buf, out_row_ctr, out_rows_avail);
-    if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
+    if (mainp->rowgroup_ctr < mainp->rowgroups_avail)
      return;			/* Need to suspend */
    /* After the first iMCU, change wraparound pointers to normal state */
-    if (main_ptr->iMCU_row_ctr == 1)
+    if (mainp->iMCU_row_ctr == 1)
      set_wraparound_pointers(cinfo);
    /* Prepare to load new iMCU row using other xbuffer list */
-    main_ptr->whichptr ^= 1;	/* 0=>1 or 1=>0 */
-    main_ptr->buffer_full = FALSE;
+    mainp->whichptr ^= 1;	/* 0=>1 or 1=>0 */
+    mainp->buffer_full = FALSE;
    /* Still need to process last row group of this iMCU row, */
    /* which is saved at index M+1 of the other xbuffer */
-    main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_scaled_size + 1);
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_scaled_size + 2);
-    main_ptr->context_state = CTX_POSTPONED_ROW;
+    mainp->rowgroup_ctr = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 1);
+    mainp->rowgroups_avail = (JDIMENSION) (cinfo->min_DCT_v_scaled_size + 2);
+    mainp->context_state = CTX_POSTPONED_ROW;
  }
 }

@@ -469,21 +470,21 @@ process_data_crank_post (j_decompress_ptr cinfo,


 /*
- * Initialize main_ptr buffer controller.
+ * Initialize main buffer controller.
 */

 GLOBAL(void)
 jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 {
-  my_main_ptr main_ptr;
+  my_main_ptr mainp;
  int ci, rgroup, ngroups;
  jpeg_component_info *compptr;

-  main_ptr = (my_main_ptr)
+  mainp = (my_main_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_main_controller));
-  cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
-  main_ptr->pub.start_pass = start_pass_main;
+  cinfo->main = &mainp->pub;
+  mainp->pub.start_pass = start_pass_main;

  if (need_full_buffer)		/* shouldn't happen */
    ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -492,21 +493,21 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
   * ngroups is the number of row groups we need.
   */
  if (cinfo->upsample->need_context_rows) {
-    if (cinfo->min_DCT_scaled_size < 2) /* unsupported, see comments above */
+    if (cinfo->min_DCT_v_scaled_size < 2) /* unsupported, see comments above */
      ERREXIT(cinfo, JERR_NOTIMPL);
    alloc_funny_pointers(cinfo); /* Alloc space for xbuffer[] lists */
-    ngroups = cinfo->min_DCT_scaled_size + 2;
+    ngroups = cinfo->min_DCT_v_scaled_size + 2;
  } else {
-    ngroups = cinfo->min_DCT_scaled_size;
+    ngroups = cinfo->min_DCT_v_scaled_size;
  }

  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    rgroup = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-      cinfo->min_DCT_scaled_size; /* height of a row group of component */
-    main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
+    rgroup = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+      cinfo->min_DCT_v_scaled_size; /* height of a row group of component */
+    mainp->buffer[ci] = (*cinfo->mem->alloc_sarray)
      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-             compptr->width_in_blocks * compptr->DCT_scaled_size,
+       compptr->width_in_blocks * ((JDIMENSION) compptr->DCT_h_scaled_size),
       (JDIMENSION) (rgroup * ngroups));
  }
 }
--- a/3rdparty/libjpeg/jdmarker.c
+++ b/3rdparty/libjpeg/jdmarker.c
@@ -2,6 +2,7 @@
 * jdmarker.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2009-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -76,6 +77,7 @@ typedef enum {			/* JPEG marker codes */
  M_APP15 = 0xef,

  M_JPG0  = 0xf0,
+  M_JPG8  = 0xf8,
  M_JPG13 = 0xfd,
  M_COM   = 0xfe,

@@ -216,6 +218,7 @@ get_soi (j_decompress_ptr cinfo)
  /* Set initial assumptions for colorspace etc */

  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->color_transform = JCT_NONE;
  cinfo->CCIR601_sampling = FALSE; /* Assume non-CCIR sampling??? */

  cinfo->saw_JFIF_marker = FALSE;
@@ -234,14 +237,16 @@ get_soi (j_decompress_ptr cinfo)


 LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof (j_decompress_ptr cinfo, boolean is_baseline, boolean is_prog,
+         boolean is_arith)
 /* Process a SOFn marker */
 {
  INT32 length;
-  int c, ci;
+  int c, ci, i;
  jpeg_component_info * compptr;
  INPUT_VARS(cinfo);

+  cinfo->is_baseline = is_baseline;
  cinfo->progressive_mode = is_prog;
  cinfo->arith_code = is_arith;

@@ -276,10 +281,26 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
                         cinfo->num_components * SIZEOF(jpeg_component_info));

-  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
-       ci++, compptr++) {
+  for (ci = 0; ci < cinfo->num_components; ci++) {
+    INPUT_BYTE(cinfo, c, return FALSE);
+    /* Check to see whether component id has already been seen   */
+    /* (in violation of the spec, but unfortunately seen in some */
+    /* files).  If so, create "fake" component id equal to the   */
+    /* max id seen so far + 1. */
+    for (i = 0, compptr = cinfo->comp_info; i < ci; i++, compptr++) {
+      if (c == compptr->component_id) {
+        compptr = cinfo->comp_info;
+        c = compptr->component_id;
+        compptr++;
+        for (i = 1; i < ci; i++, compptr++) {
+          if (compptr->component_id > c) c = compptr->component_id;
+        }
+        c++;
+        break;
+      }
+    }
+    compptr->component_id = c;
    compptr->component_index = ci;
-    INPUT_BYTE(cinfo, compptr->component_id, return FALSE);
    INPUT_BYTE(cinfo, c, return FALSE);
    compptr->h_samp_factor = (c >> 4) & 15;
    compptr->v_samp_factor = (c     ) & 15;
@@ -302,12 +323,12 @@ get_sos (j_decompress_ptr cinfo)
 /* Process a SOS marker */
 {
  INT32 length;
-  int i, ci, n, c, cc;
+  int c, ci, i, n;
  jpeg_component_info * compptr;
  INPUT_VARS(cinfo);

  if (! cinfo->marker->saw_SOF)
-    ERREXIT(cinfo, JERR_SOS_NO_SOF);
+    ERREXITS(cinfo, JERR_SOF_BEFORE, "SOS");

  INPUT_2BYTES(cinfo, length, return FALSE);

@@ -315,7 +336,9 @@ get_sos (j_decompress_ptr cinfo)

  TRACEMS1(cinfo, 1, JTRC_SOS, n);

-  if (length != (n * 2 + 6) || n < 1 || n > MAX_COMPS_IN_SCAN)
+  if (length != (n * 2 + 6) || n > MAX_COMPS_IN_SCAN ||
+      (n == 0 && !cinfo->progressive_mode))
+      /* pseudo SOS marker only allowed in progressive mode */
    ERREXIT(cinfo, JERR_BAD_LENGTH);

  cinfo->comps_in_scan = n;
@@ -323,24 +346,38 @@ get_sos (j_decompress_ptr cinfo)
  /* Collect the component-spec parameters */

  for (i = 0; i < n; i++) {
-    INPUT_BYTE(cinfo, cc, return FALSE);
    INPUT_BYTE(cinfo, c, return FALSE);

+    /* Detect the case where component id's are not unique, and, if so, */
+    /* create a fake component id using the same logic as in get_sof.   */
+    for (ci = 0; ci < i; ci++) {
+      if (c == cinfo->cur_comp_info[ci]->component_id) {
+        c = cinfo->cur_comp_info[0]->component_id;
+        for (ci = 1; ci < i; ci++) {
+          compptr = cinfo->cur_comp_info[ci];
+          if (compptr->component_id > c) c = compptr->component_id;
+        }
+        c++;
+        break;
+      }
+    }
+
    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
         ci++, compptr++) {
-      if (cc == compptr->component_id)
+      if (c == compptr->component_id)
        goto id_found;
    }

-    ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
+    ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, c);

  id_found:

    cinfo->cur_comp_info[i] = compptr;
+    INPUT_BYTE(cinfo, c, return FALSE);
    compptr->dc_tbl_no = (c >> 4) & 15;
    compptr->ac_tbl_no = (c     ) & 15;

-    TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, cc,
+    TRACEMS3(cinfo, 1, JTRC_SOS_COMPONENT, compptr->component_id,
             compptr->dc_tbl_no, compptr->ac_tbl_no);
  }

@@ -359,8 +396,8 @@ get_sos (j_decompress_ptr cinfo)
  /* Prepare to scan data & restart markers */
  cinfo->marker->next_restart_num = 0;

-  /* Count another SOS marker */
-  cinfo->input_scan_number++;
+  /* Count another (non-pseudo) SOS marker */
+  if (n) cinfo->input_scan_number++;

  INPUT_SYNC(cinfo);
  return TRUE;
@@ -490,16 +527,18 @@ LOCAL(boolean)
 get_dqt (j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
-  INT32 length;
-  int n, i, prec;
+  INT32 length, count, i;
+  int n, prec;
  unsigned int tmp;
  JQUANT_TBL *quant_ptr;
+  const int *natural_order;
  INPUT_VARS(cinfo);

  INPUT_2BYTES(cinfo, length, return FALSE);
  length -= 2;

  while (length > 0) {
+    length--;
    INPUT_BYTE(cinfo, n, return FALSE);
    prec = n >> 4;
    n &= 0x0F;
@@ -513,13 +552,43 @@ get_dqt (j_decompress_ptr cinfo)
      cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
    quant_ptr = cinfo->quant_tbl_ptrs[n];

+    if (prec) {
+      if (length < DCTSIZE2 * 2) {
+        /* Initialize full table for safety. */
        for (i = 0; i < DCTSIZE2; i++) {
+          quant_ptr->quantval[i] = 1;
+        }
+        count = length >> 1;
+      } else
+        count = DCTSIZE2;
+    } else {
+      if (length < DCTSIZE2) {
+        /* Initialize full table for safety. */
+        for (i = 0; i < DCTSIZE2; i++) {
+          quant_ptr->quantval[i] = 1;
+        }
+        count = length;
+      } else
+        count = DCTSIZE2;
+    }
+
+    switch (count) {
+    case (2*2): natural_order = jpeg_natural_order2; break;
+    case (3*3): natural_order = jpeg_natural_order3; break;
+    case (4*4): natural_order = jpeg_natural_order4; break;
+    case (5*5): natural_order = jpeg_natural_order5; break;
+    case (6*6): natural_order = jpeg_natural_order6; break;
+    case (7*7): natural_order = jpeg_natural_order7; break;
+    default:    natural_order = jpeg_natural_order;  break;
+    }
+
+    for (i = 0; i < count; i++) {
      if (prec)
        INPUT_2BYTES(cinfo, tmp, return FALSE);
      else
        INPUT_BYTE(cinfo, tmp, return FALSE);
      /* We convert the zigzag-order table to natural array order. */
-      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+      quant_ptr->quantval[natural_order[i]] = (UINT16) tmp;
    }

    if (cinfo->err->trace_level >= 2) {
@@ -532,8 +601,8 @@ get_dqt (j_decompress_ptr cinfo)
      }
    }

-    length -= DCTSIZE2+1;
-    if (prec) length -= DCTSIZE2;
+    length -= count;
+    if (prec) length -= count;
  }

  if (length != 0)
@@ -568,6 +637,68 @@ get_dri (j_decompress_ptr cinfo)
 }


+LOCAL(boolean)
+get_lse (j_decompress_ptr cinfo)
+/* Process an LSE marker */
+{
+  INT32 length;
+  unsigned int tmp;
+  int cid;
+  INPUT_VARS(cinfo);
+
+  if (! cinfo->marker->saw_SOF)
+    ERREXITS(cinfo, JERR_SOF_BEFORE, "LSE");
+
+  if (cinfo->num_components < 3) goto bad;
+
+  INPUT_2BYTES(cinfo, length, return FALSE);
+
+  if (length != 24)
+    ERREXIT(cinfo, JERR_BAD_LENGTH);
+
+  INPUT_BYTE(cinfo, tmp, return FALSE);
+  if (tmp != 0x0D)	/* ID inverse transform specification */
+    ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, cinfo->unread_marker);
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != MAXJSAMPLE) goto bad;		/* MAXTRANS */
+  INPUT_BYTE(cinfo, tmp, return FALSE);
+  if (tmp != 3) goto bad;			/* Nt=3 */
+  INPUT_BYTE(cinfo, cid, return FALSE);
+  if (cid != cinfo->comp_info[1].component_id) goto bad;
+  INPUT_BYTE(cinfo, cid, return FALSE);
+  if (cid != cinfo->comp_info[0].component_id) goto bad;
+  INPUT_BYTE(cinfo, cid, return FALSE);
+  if (cid != cinfo->comp_info[2].component_id) goto bad;
+  INPUT_BYTE(cinfo, tmp, return FALSE);
+  if (tmp != 0x80) goto bad;		/* F1: CENTER1=1, NORM1=0 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 0) goto bad;			/* A(1,1)=0 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 0) goto bad;			/* A(1,2)=0 */
+  INPUT_BYTE(cinfo, tmp, return FALSE);
+  if (tmp != 0) goto bad;		/* F2: CENTER2=0, NORM2=0 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 1) goto bad;			/* A(2,1)=1 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 0) goto bad;			/* A(2,2)=0 */
+  INPUT_BYTE(cinfo, tmp, return FALSE);
+  if (tmp != 0) goto bad;		/* F3: CENTER3=0, NORM3=0 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 1) goto bad;			/* A(3,1)=1 */
+  INPUT_2BYTES(cinfo, tmp, return FALSE);
+  if (tmp != 0) {				/* A(3,2)=0 */
+    bad:
+    ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
+  }
+
+  /* OK, valid transform that we can handle. */
+  cinfo->color_transform = JCT_SUBTRACT_GREEN;
+
+  INPUT_SYNC(cinfo);
+  return TRUE;
+}
+
+
 /*
 * Routines for processing APPn and COM markers.
 * These are either saved in memory or discarded, per application request.
@@ -946,6 +1077,11 @@ first_marker (j_decompress_ptr cinfo)
 *
 * Returns same codes as are defined for jpeg_consume_input:
 * JPEG_SUSPENDED, JPEG_REACHED_SOS, or JPEG_REACHED_EOI.
+ *
+ * Note: This function may return a pseudo SOS marker (with zero
+ * component number) for treat by input controller's consume_input.
+ * consume_input itself should filter out (skip) the pseudo marker
+ * after processing for the caller.
 */

 METHODDEF(int)
@@ -975,23 +1111,27 @@ read_markers (j_decompress_ptr cinfo)
      break;

    case M_SOF0:		/* Baseline */
+      if (! get_sof(cinfo, TRUE, FALSE, FALSE))
+        return JPEG_SUSPENDED;
+      break;
+
    case M_SOF1:		/* Extended sequential, Huffman */
-      if (! get_sof(cinfo, FALSE, FALSE))
+      if (! get_sof(cinfo, FALSE, FALSE, FALSE))
        return JPEG_SUSPENDED;
      break;

    case M_SOF2:		/* Progressive, Huffman */
-      if (! get_sof(cinfo, TRUE, FALSE))
+      if (! get_sof(cinfo, FALSE, TRUE, FALSE))
        return JPEG_SUSPENDED;
      break;

    case M_SOF9:		/* Extended sequential, arithmetic */
-      if (! get_sof(cinfo, FALSE, TRUE))
+      if (! get_sof(cinfo, FALSE, FALSE, TRUE))
        return JPEG_SUSPENDED;
      break;

    case M_SOF10:		/* Progressive, arithmetic */
-      if (! get_sof(cinfo, TRUE, TRUE))
+      if (! get_sof(cinfo, FALSE, TRUE, TRUE))
        return JPEG_SUSPENDED;
      break;

@@ -1039,6 +1179,11 @@ read_markers (j_decompress_ptr cinfo)
        return JPEG_SUSPENDED;
      break;

+    case M_JPG8:
+      if (! get_lse(cinfo))
+        return JPEG_SUSPENDED;
+      break;
+
    case M_APP0:
    case M_APP1:
    case M_APP2:
@@ -1268,7 +1413,7 @@ jinit_marker_reader (j_decompress_ptr cinfo)
  marker = (my_marker_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
                                SIZEOF(my_marker_reader));
-  cinfo->marker = (struct jpeg_marker_reader *) marker;
+  cinfo->marker = &marker->pub;
  /* Initialize public method pointers */
  marker->pub.reset_marker_reader = reset_marker_reader;
  marker->pub.read_markers = read_markers;
--- a/3rdparty/libjpeg/jdmaster.c
+++ b/3rdparty/libjpeg/jdmaster.c
@@ -2,6 +2,7 @@
 * jdmaster.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2002-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -61,9 +62,12 @@ use_merged_upsample (j_decompress_ptr cinfo)
      cinfo->comp_info[2].v_samp_factor != 1)
    return FALSE;
  /* furthermore, it doesn't work if we've scaled the IDCTs differently */
-  if (cinfo->comp_info[0].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[1].DCT_scaled_size != cinfo->min_DCT_scaled_size ||
-      cinfo->comp_info[2].DCT_scaled_size != cinfo->min_DCT_scaled_size)
+  if (cinfo->comp_info[0].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[1].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[2].DCT_h_scaled_size != cinfo->min_DCT_h_scaled_size ||
+      cinfo->comp_info[0].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[1].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size ||
+      cinfo->comp_info[2].DCT_v_scaled_size != cinfo->min_DCT_v_scaled_size)
    return FALSE;
  /* ??? also need to test for upsample-time rescaling, when & if supported */
  return TRUE;			/* by golly, it'll work... */
@@ -82,7 +86,9 @@ use_merged_upsample (j_decompress_ptr cinfo)

 GLOBAL(void)
 jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
-/* Do computations that are needed before master selection phase */
+/* Do computations that are needed before master selection phase.
+ * This function is used for full decompression.
+ */
 {
 #ifdef IDCT_SCALING_SUPPORTED
  int ci;
@@ -93,52 +99,38 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
  if (cinfo->global_state != DSTATE_READY)
    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);

+  /* Compute core output image dimensions and DCT scaling choices. */
+  jpeg_core_output_dimensions(cinfo);
+
 #ifdef IDCT_SCALING_SUPPORTED

-  /* Compute actual output image dimensions and DCT scaling choices. */
-  if (cinfo->scale_num * 8 <= cinfo->scale_denom) {
-    /* Provide 1/8 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 8L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 8L);
-    cinfo->min_DCT_scaled_size = 1;
-  } else if (cinfo->scale_num * 4 <= cinfo->scale_denom) {
-    /* Provide 1/4 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 4L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 4L);
-    cinfo->min_DCT_scaled_size = 2;
-  } else if (cinfo->scale_num * 2 <= cinfo->scale_denom) {
-    /* Provide 1/2 scaling */
-    cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, 2L);
-    cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, 2L);
-    cinfo->min_DCT_scaled_size = 4;
-  } else {
-    /* Provide 1/1 scaling */
-    cinfo->output_width = cinfo->image_width;
-    cinfo->output_height = cinfo->image_height;
-    cinfo->min_DCT_scaled_size = DCTSIZE;
-  }
  /* In selecting the actual DCT scaling for each component, we try to
   * scale up the chroma components via IDCT scaling rather than upsampling.
   * This saves time if the upsampler gets to use 1:1 scaling.
-   * Note this code assumes that the supported DCT scalings are powers of 2.
+   * Note this code adapts subsampling ratios which are powers of 2.
   */
  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
       ci++, compptr++) {
-    int ssize = cinfo->min_DCT_scaled_size;
-    while (ssize < DCTSIZE &&
-       (compptr->h_samp_factor * ssize * 2 <=
-        cinfo->max_h_samp_factor * cinfo->min_DCT_scaled_size) &&
-       (compptr->v_samp_factor * ssize * 2 <=
-        cinfo->max_v_samp_factor * cinfo->min_DCT_scaled_size)) {
+    int ssize = 1;
+    while (cinfo->min_DCT_h_scaled_size * ssize <=
+           (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+           (cinfo->max_h_samp_factor % (compptr->h_samp_factor * ssize * 2)) == 0) {
      ssize = ssize * 2;
    }
-    compptr->DCT_scaled_size = ssize;
+    compptr->DCT_h_scaled_size = cinfo->min_DCT_h_scaled_size * ssize;
+    ssize = 1;
+    while (cinfo->min_DCT_v_scaled_size * ssize <=
+           (cinfo->do_fancy_upsampling ? DCTSIZE : DCTSIZE / 2) &&
+           (cinfo->max_v_samp_factor % (compptr->v_samp_factor * ssize * 2)) == 0) {
+      ssize = ssize * 2;
+    }
+    compptr->DCT_v_scaled_size = cinfo->min_DCT_v_scaled_size * ssize;
+
+    /* We don't support IDCT ratios larger than 2. */
+    if (compptr->DCT_h_scaled_size > compptr->DCT_v_scaled_size * 2)
+        compptr->DCT_h_scaled_size = compptr->DCT_v_scaled_size * 2;
+    else if (compptr->DCT_v_scaled_size > compptr->DCT_h_scaled_size * 2)
+        compptr->DCT_v_scaled_size = compptr->DCT_h_scaled_size * 2;
  }

  /* Recompute downsampled dimensions of components;
@@ -149,23 +141,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    /* Size in samples, after IDCT scaling */
    compptr->downsampled_width = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_width *
-            (long) (compptr->h_samp_factor * compptr->DCT_scaled_size),
-            (long) (cinfo->max_h_samp_factor * DCTSIZE));
+                    (long) (compptr->h_samp_factor * compptr->DCT_h_scaled_size),
+                    (long) (cinfo->max_h_samp_factor * cinfo->block_size));
    compptr->downsampled_height = (JDIMENSION)
      jdiv_round_up((long) cinfo->image_height *
-            (long) (compptr->v_samp_factor * compptr->DCT_scaled_size),
-            (long) (cinfo->max_v_samp_factor * DCTSIZE));
+                    (long) (compptr->v_samp_factor * compptr->DCT_v_scaled_size),
+                    (long) (cinfo->max_v_samp_factor * cinfo->block_size));
  }

-#else /* !IDCT_SCALING_SUPPORTED */
-
-  /* Hardwire it to "no scaling" */
-  cinfo->output_width = cinfo->image_width;
-  cinfo->output_height = cinfo->image_height;
-  /* jdinput.c has already initialized DCT_scaled_size to DCTSIZE,
-   * and has computed unscaled downsampled_width and downsampled_height.
-   */
-
 #endif /* IDCT_SCALING_SUPPORTED */

  /* Report number of components in selected colorspace. */
@@ -175,10 +158,8 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
    cinfo->out_color_components = 1;
    break;
  case JCS_RGB:
-#if RGB_PIXELSIZE != 3
    cinfo->out_color_components = RGB_PIXELSIZE;
    break;
-#endif /* else share code with YCbCr */
  case JCS_YCbCr:
    cinfo->out_color_components = 3;
    break;
@@ -372,16 +353,9 @@ master_selection (j_decompress_ptr cinfo)
  /* Inverse DCT */
  jinit_inverse_dct(cinfo);
  /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
    jinit_huff_decoder(cinfo);
  }

--- a/3rdparty/libjpeg/jdphuff.c
+++ b/3rdparty/libjpeg/jdphuff.c
@@ -1,668 +0,0 @@
-/*
- * jdphuff.c
- *
- * Copyright (C) 1995-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains Huffman entropy decoding routines for progressive JPEG.
- *
- * Much of the complexity here has to do with supporting input suspension.
- * If the data source module demands suspension, we want to be able to back
- * up to the start of the current MCU.  To do this, we copy state variables
- * into local working storage, and update them back to the permanent
- * storage only upon successful completion of an MCU.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdhuff.h"		/* Declarations shared with jdhuff.c */
-
-
-#ifdef D_PROGRESSIVE_SUPPORTED
-
-/*
- * Expanded entropy decoder object for progressive Huffman decoding.
- *
- * The savable_state subrecord contains fields that change within an MCU,
- * but must not be updated permanently until we complete the MCU.
- */
-
-typedef struct {
-  unsigned int EOBRUN;			/* remaining EOBs in EOBRUN */
-  int last_dc_val[MAX_COMPS_IN_SCAN];	/* last DC coef for each component */
-} savable_state;
-
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-    ((dest).EOBRUN = (src).EOBRUN, \
-     (dest).last_dc_val[0] = (src).last_dc_val[0], \
-     (dest).last_dc_val[1] = (src).last_dc_val[1], \
-     (dest).last_dc_val[2] = (src).last_dc_val[2], \
-     (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
-typedef struct {
-  struct jpeg_entropy_decoder pub; /* public fields */
-
-  /* These fields are loaded into local variables at start of each MCU.
-   * In case of suspension, we exit WITHOUT updating them.
-   */
-  bitread_perm_state bitstate;	/* Bit buffer at start of MCU */
-  savable_state saved;		/* Other state at start of MCU */
-
-  /* These fields are NOT loaded into local working state. */
-  unsigned int restarts_to_go;	/* MCUs left in this restart interval */
-
-  /* Pointers to derived tables (these workspaces have image lifespan) */
-  d_derived_tbl * derived_tbls[NUM_HUFF_TBLS];
-
-  d_derived_tbl * ac_derived_tbl; /* active table during an AC scan */
-} phuff_entropy_decoder;
-
-typedef phuff_entropy_decoder * phuff_entropy_ptr;
-
-/* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first JPP((j_decompress_ptr cinfo,
-                        JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_first JPP((j_decompress_ptr cinfo,
-                        JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_DC_refine JPP((j_decompress_ptr cinfo,
-                         JBLOCKROW *MCU_data));
-METHODDEF(boolean) decode_mcu_AC_refine JPP((j_decompress_ptr cinfo,
-                         JBLOCKROW *MCU_data));
-
-
-/*
- * Initialize for a Huffman-compressed scan.
- */
-
-METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  boolean is_DC_band, bad;
-  int ci, coefi, tbl;
-  int *coef_bit_ptr;
-  jpeg_component_info * compptr;
-
-  is_DC_band = (cinfo->Ss == 0);
-
-  /* Validate scan parameters */
-  bad = FALSE;
-  if (is_DC_band) {
-    if (cinfo->Se != 0)
-      bad = TRUE;
-  } else {
-    /* need not check Ss/Se < 0 since they came from unsigned bytes */
-    if (cinfo->Ss > cinfo->Se || cinfo->Se >= DCTSIZE2)
-      bad = TRUE;
-    /* AC scans may have only one component */
-    if (cinfo->comps_in_scan != 1)
-      bad = TRUE;
-  }
-  if (cinfo->Ah != 0) {
-    /* Successive approximation refinement scan: must have Al = Ah-1. */
-    if (cinfo->Al != cinfo->Ah-1)
-      bad = TRUE;
-  }
-  if (cinfo->Al > 13)		/* need not check for < 0 */
-    bad = TRUE;
-  /* Arguably the maximum Al value should be less than 13 for 8-bit precision,
-   * but the spec doesn't say so, and we try to be liberal about what we
-   * accept.  Note: large Al values could result in out-of-range DC
-   * coefficients during early scans, leading to bizarre displays due to
-   * overflows in the IDCT math.  But we won't crash.
-   */
-  if (bad)
-    ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
-         cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
-  /* Update progression status, and verify that scan order is legal.
-   * Note that inter-scan inconsistencies are treated as warnings
-   * not fatal errors ... not clear if this is right way to behave.
-   */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    int cindex = cinfo->cur_comp_info[ci]->component_index;
-    coef_bit_ptr = & cinfo->coef_bits[cindex][0];
-    if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
-      WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
-    for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
-      int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
-      if (cinfo->Ah != expected)
-    WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, coefi);
-      coef_bit_ptr[coefi] = cinfo->Al;
-    }
-  }
-
-  /* Select MCU decoding routine */
-  if (cinfo->Ah == 0) {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_first;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_first;
-  } else {
-    if (is_DC_band)
-      entropy->pub.decode_mcu = decode_mcu_DC_refine;
-    else
-      entropy->pub.decode_mcu = decode_mcu_AC_refine;
-  }
-
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-    compptr = cinfo->cur_comp_info[ci];
-    /* Make sure requested tables are present, and compute derived tables.
-     * We may build same derived table more than once, but it's not expensive.
-     */
-    if (is_DC_band) {
-      if (cinfo->Ah == 0) {	/* DC refinement needs no table */
-    tbl = compptr->dc_tbl_no;
-    jpeg_make_d_derived_tbl(cinfo, TRUE, tbl,
-                & entropy->derived_tbls[tbl]);
-      }
-    } else {
-      tbl = compptr->ac_tbl_no;
-      jpeg_make_d_derived_tbl(cinfo, FALSE, tbl,
-                  & entropy->derived_tbls[tbl]);
-      /* remember the single active table */
-      entropy->ac_derived_tbl = entropy->derived_tbls[tbl];
-    }
-    /* Initialize DC predictions to 0 */
-    entropy->saved.last_dc_val[ci] = 0;
-  }
-
-  /* Initialize bitread state variables */
-  entropy->bitstate.bits_left = 0;
-  entropy->bitstate.get_buffer = 0; /* unnecessary, but keeps Purify quiet */
-  entropy->pub.insufficient_data = FALSE;
-
-  /* Initialize private state variables */
-  entropy->saved.EOBRUN = 0;
-
-  /* Initialize restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-}
-
-
-/*
- * Figure F.12: extend sign bit.
- * On some machines, a shift and add will be faster than a table lookup.
- */
-
-#ifdef AVOID_TABLES
-
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((-1)<<(s)) + 1) : (x))
-
-#else
-
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
-
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-
-#endif /* AVOID_TABLES */
-
-
-/*
- * Check for a restart marker & resynchronize decoder.
- * Returns FALSE if must suspend.
- */
-
-LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int ci;
-
-  /* Throw away any unused bits remaining in bit buffer; */
-  /* include any full bytes in next_marker's count of discarded bytes */
-  cinfo->marker->discarded_bytes += entropy->bitstate.bits_left / 8;
-  entropy->bitstate.bits_left = 0;
-
-  /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
-    return FALSE;
-
-  /* Re-initialize DC predictions to 0 */
-  for (ci = 0; ci < cinfo->comps_in_scan; ci++)
-    entropy->saved.last_dc_val[ci] = 0;
-  /* Re-init EOB run count, too */
-  entropy->saved.EOBRUN = 0;
-
-  /* Reset restart counter */
-  entropy->restarts_to_go = cinfo->restart_interval;
-
-  /* Reset out-of-data flag, unless read_restart_marker left us smack up
-   * against a marker.  In that case we will end up treating the next data
-   * segment as empty, and we can avoid producing bogus output pixels by
-   * leaving the flag set.
-   */
-  if (cinfo->unread_marker == 0)
-    entropy->pub.insufficient_data = FALSE;
-
-  return TRUE;
-}
-
-
-/*
- * Huffman MCU decoding.
- * Each of these routines decodes and returns one MCU's worth of
- * Huffman-compressed coefficients.
- * The coefficients are reordered from zigzag order into natural array order,
- * but are not dequantized.
- *
- * The i'th block of the MCU is stored into the block pointed to by
- * MCU_data[i].  WE ASSUME THIS AREA IS INITIALLY ZEROED BY THE CALLER.
- *
- * We return FALSE if data source requested suspension.  In that case no
- * changes have been made to permanent state.  (Exception: some output
- * coefficients may already have been assigned.  This is harmless for
- * spectral selection, since we'll just re-assign them on the next call.
- * Successive approximation AC refinement has to be more careful, however.)
- */
-
-/*
- * MCU decoding for DC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Al = cinfo->Al;
-  register int s, r;
-  int blkn, ci;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  savable_state state;
-  d_derived_tbl * tbl;
-  jpeg_component_info * compptr;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-    return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
-
-    /* Outer loop handles each block in the MCU */
-
-    for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-      block = MCU_data[blkn];
-      ci = cinfo->MCU_membership[blkn];
-      compptr = cinfo->cur_comp_info[ci];
-      tbl = entropy->derived_tbls[compptr->dc_tbl_no];
-
-      /* Decode a single block's worth of coefficients */
-
-      /* Section F.2.2.1: decode the DC coefficient difference */
-      HUFF_DECODE(s, br_state, tbl, return FALSE, label1);
-      if (s) {
-    CHECK_BIT_BUFFER(br_state, s, return FALSE);
-    r = GET_BITS(s);
-    s = HUFF_EXTEND(r, s);
-      }
-
-      /* Convert DC difference to actual value, update last_dc_val */
-      s += state.last_dc_val[ci];
-      state.last_dc_val[ci] = s;
-      /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) (s << Al);
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC initial scan (either spectral selection,
- * or first pass of successive approximation).
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int Al = cinfo->Al;
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-    return FALSE;
-  }
-
-  /* If we've run out of data, just leave the MCU set to zeroes.
-   * This way, we return uniform gray for the remainder of the segment.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state.
-     * We can avoid loading/saving bitread state if in an EOB run.
-     */
-    EOBRUN = entropy->saved.EOBRUN;	/* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-
-    if (EOBRUN > 0)		/* if it's a band of zeroes... */
-      EOBRUN--;			/* ...process it now (we do nothing) */
-    else {
-      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-      block = MCU_data[0];
-      tbl = entropy->ac_derived_tbl;
-
-      for (k = cinfo->Ss; k <= Se; k++) {
-    HUFF_DECODE(s, br_state, tbl, return FALSE, label2);
-    r = s >> 4;
-    s &= 15;
-    if (s) {
-      k += r;
-      CHECK_BIT_BUFFER(br_state, s, return FALSE);
-      r = GET_BITS(s);
-      s = HUFF_EXTEND(r, s);
-      /* Scale and output coefficient in natural (dezigzagged) order */
-      (*block)[jpeg_natural_order[k]] = (JCOEF) (s << Al);
-    } else {
-      if (r == 15) {	/* ZRL */
-        k += 15;		/* skip 15 zeroes in band */
-      } else {		/* EOBr, run length is 2^r + appended bits */
-        EOBRUN = 1 << r;
-        if (r) {		/* EOBr, r > 0 */
-          CHECK_BIT_BUFFER(br_state, r, return FALSE);
-          r = GET_BITS(r);
-          EOBRUN += r;
-        }
-        EOBRUN--;		/* this band is processed at this moment */
-        break;		/* force end-of-band */
-      }
-    }
-      }
-
-      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    }
-
-    /* Completed MCU, so update state */
-    entropy->saved.EOBRUN = EOBRUN;	/* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for DC successive approximation refinement scan.
- * Note: we assume such scans can be multi-component, although the spec
- * is not very clear on the point.
- */
-
-METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int blkn;
-  JBLOCKROW block;
-  BITREAD_STATE_VARS;
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-    return FALSE;
-  }
-
-  /* Not worth the cycles to check insufficient_data here,
-   * since we will not change the data anyway if we read zeroes.
-   */
-
-  /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-
-  /* Outer loop handles each block in the MCU */
-
-  for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
-    block = MCU_data[blkn];
-
-    /* Encoded data is simply the next bit of the two's-complement DC value */
-    CHECK_BIT_BUFFER(br_state, 1, return FALSE);
-    if (GET_BITS(1))
-      (*block)[0] |= p1;
-    /* Note: since we use |=, repeating the assignment later is safe */
-  }
-
-  /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-}
-
-
-/*
- * MCU decoding for AC successive approximation refinement scan.
- */
-
-METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
-{
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  int Se = cinfo->Se;
-  int p1 = 1 << cinfo->Al;	/* 1 in the bit position being coded */
-  int m1 = (-1) << cinfo->Al;	/* -1 in the bit position being coded */
-  register int s, k, r;
-  unsigned int EOBRUN;
-  JBLOCKROW block;
-  JCOEFPTR thiscoef;
-  BITREAD_STATE_VARS;
-  d_derived_tbl * tbl;
-  int num_newnz;
-  int newnz_pos[DCTSIZE2];
-
-  /* Process restart marker if needed; may have to suspend */
-  if (cinfo->restart_interval) {
-    if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
-    return FALSE;
-  }
-
-  /* If we've run out of data, don't modify the MCU.
-   */
-  if (! entropy->pub.insufficient_data) {
-
-    /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
-
-    /* There is always only one block per MCU */
-    block = MCU_data[0];
-    tbl = entropy->ac_derived_tbl;
-
-    /* If we are forced to suspend, we must undo the assignments to any newly
-     * nonzero coefficients in the block, because otherwise we'd get confused
-     * next time about which coefficients were already nonzero.
-     * But we need not undo addition of bits to already-nonzero coefficients;
-     * instead, we can test the current bit to see if we already did it.
-     */
-    num_newnz = 0;
-
-    /* initialize coefficient loop counter to start of band */
-    k = cinfo->Ss;
-
-    if (EOBRUN == 0) {
-      for (; k <= Se; k++) {
-    HUFF_DECODE(s, br_state, tbl, goto undoit, label3);
-    r = s >> 4;
-    s &= 15;
-    if (s) {
-      if (s != 1)		/* size of new coef should always be 1 */
-        WARNMS(cinfo, JWRN_HUFF_BAD_CODE);
-      CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-      if (GET_BITS(1))
-        s = p1;		/* newly nonzero coef is positive */
-      else
-        s = m1;		/* newly nonzero coef is negative */
-    } else {
-      if (r != 15) {
-        EOBRUN = 1 << r;	/* EOBr, run length is 2^r + appended bits */
-        if (r) {
-          CHECK_BIT_BUFFER(br_state, r, goto undoit);
-          r = GET_BITS(r);
-          EOBRUN += r;
-        }
-        break;		/* rest of block is handled by EOB logic */
-      }
-      /* note s = 0 for processing ZRL */
-    }
-    /* Advance over already-nonzero coefs and r still-zero coefs,
-     * appending correction bits to the nonzeroes.  A correction bit is 1
-     * if the absolute value of the coefficient must be increased.
-     */
-    do {
-      thiscoef = *block + jpeg_natural_order[k];
-      if (*thiscoef != 0) {
-        CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-        if (GET_BITS(1)) {
-          if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
-        if (*thiscoef >= 0)
-          *thiscoef += p1;
-        else
-          *thiscoef += m1;
-          }
-        }
-      } else {
-        if (--r < 0)
-          break;		/* reached target zero coefficient */
-      }
-      k++;
-    } while (k <= Se);
-    if (s) {
-      int pos = jpeg_natural_order[k];
-      /* Output newly nonzero coefficient */
-      (*block)[pos] = (JCOEF) s;
-      /* Remember its position in case we have to suspend */
-      newnz_pos[num_newnz++] = pos;
-    }
-      }
-    }
-
-    if (EOBRUN > 0) {
-      /* Scan any remaining coefficient positions after the end-of-band
-       * (the last newly nonzero coefficient, if any).  Append a correction
-       * bit to each already-nonzero coefficient.  A correction bit is 1
-       * if the absolute value of the coefficient must be increased.
-       */
-      for (; k <= Se; k++) {
-    thiscoef = *block + jpeg_natural_order[k];
-    if (*thiscoef != 0) {
-      CHECK_BIT_BUFFER(br_state, 1, goto undoit);
-      if (GET_BITS(1)) {
-        if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
-          if (*thiscoef >= 0)
-        *thiscoef += p1;
-          else
-        *thiscoef += m1;
-        }
-      }
-    }
-      }
-      /* Count one block completed in EOB run */
-      EOBRUN--;
-    }
-
-    /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
-  }
-
-  /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
-
-  return TRUE;
-
-undoit:
-  /* Re-zero any output coefficients that we made newly nonzero */
-  while (num_newnz > 0)
-    (*block)[newnz_pos[--num_newnz]] = 0;
-
-  return FALSE;
-}
-
-
-/*
- * Module initialization routine for progressive Huffman entropy decoding.
- */
-
-GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
-{
-  phuff_entropy_ptr entropy;
-  int *coef_bit_ptr;
-  int ci, i;
-
-  entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                SIZEOF(phuff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
-  entropy->pub.start_pass = start_pass_phuff_decoder;
-
-  /* Mark derived tables unallocated */
-  for (i = 0; i < NUM_HUFF_TBLS; i++) {
-    entropy->derived_tbls[i] = NULL;
-  }
-
-  /* Create progression status table */
-  cinfo->coef_bits = (int (*)[DCTSIZE2])
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                cinfo->num_components*DCTSIZE2*SIZEOF(int));
-  coef_bit_ptr = & cinfo->coef_bits[0][0];
-  for (ci = 0; ci < cinfo->num_components; ci++)
-    for (i = 0; i < DCTSIZE2; i++)
-      *coef_bit_ptr++ = -1;
-}
-
-#endif /* D_PROGRESSIVE_SUPPORTED */
--- a/3rdparty/libjpeg/jdsample.c
+++ b/3rdparty/libjpeg/jdsample.c
@@ -2,13 +2,14 @@
 * jdsample.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2002-2008 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains upsampling routines.
 *
 * Upsampling input data is counted in "row groups".  A row group
- * is defined to be (v_samp_factor * DCT_scaled_size / min_DCT_scaled_size)
+ * is defined to be (v_samp_factor * DCT_v_scaled_size / min_DCT_v_scaled_size)
 * sample rows of each component.  Upsampling will normally produce
 * max_v_samp_factor pixel rows from each row group (but this could vary
 * if the upsampler is applying a scale factor of its own).
@@ -237,11 +238,11 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  register JSAMPROW inptr, outptr;
  register JSAMPLE invalue;
  JSAMPROW outend;
-  int inrow;
+  int outrow;

-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
+  for (outrow = 0; outrow < cinfo->max_v_samp_factor; outrow++) {
+    inptr = input_data[outrow];
+    outptr = output_data[outrow];
    outend = outptr + cinfo->output_width;
    while (outptr < outend) {
      invalue = *inptr++;	/* don't need GETJSAMPLE() here */
@@ -285,112 +286,6 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 }


-/*
- * Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
- *
- * The upsampling algorithm is linear interpolation between pixel centers,
- * also known as a "triangle filter".  This is a good compromise between
- * speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
- * of the way between input pixel centers.
- *
- * A note about the "bias" calculations: when rounding fractional values to
- * integer, we do not want to always round 0.5 up to the next integer.
- * If we did that, we'd introduce a noticeable bias towards larger values.
- * Instead, this code is arranged so that 0.5 will be rounded up or down at
- * alternate pixel locations (a simple ordered dither pattern).
- */
-
-METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr, outptr;
-  register int invalue;
-  register JDIMENSION colctr;
-  int inrow;
-
-  for (inrow = 0; inrow < cinfo->max_v_samp_factor; inrow++) {
-    inptr = input_data[inrow];
-    outptr = output_data[inrow];
-    /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
-    *outptr++ = (JSAMPLE) invalue;
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
-
-    for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-      /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
-    }
-
-    /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
-    *outptr++ = (JSAMPLE) invalue;
-  }
-}
-
-
-/*
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- * Again a triangle filter; see comments for h2v1 case, above.
- *
- * It is OK for us to reference the adjacent input rows because we demanded
- * context from the main buffer controller (see initialization code).
- */
-
-METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  register JSAMPROW inptr0, inptr1, outptr;
-#if BITS_IN_JSAMPLE == 8
-  register int thiscolsum, lastcolsum, nextcolsum;
-#else
-  register INT32 thiscolsum, lastcolsum, nextcolsum;
-#endif
-  register JDIMENSION colctr;
-  int inrow, outrow, v;
-
-  inrow = outrow = 0;
-  while (outrow < cinfo->max_v_samp_factor) {
-    for (v = 0; v < 2; v++) {
-      /* inptr0 points to nearest input row, inptr1 points to next nearest */
-      inptr0 = input_data[inrow];
-      if (v == 0)		/* next nearest is row above */
-    inptr1 = input_data[inrow-1];
-      else			/* next nearest is row below */
-    inptr1 = input_data[inrow+1];
-      outptr = output_data[outrow++];
-
-      /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-      lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-
-      for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
-    /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
-    /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-    nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-    *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-    *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-    lastcolsum = thiscolsum; thiscolsum = nextcolsum;
-      }
-
-      /* Special case for last column */
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
-    }
-    inrow++;
-  }
-}
-
-
 /*
 * Module initialization routine for upsampling.
 */
@@ -401,7 +296,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
  my_upsample_ptr upsample;
  int ci;
  jpeg_component_info * compptr;
-  boolean need_buffer, do_fancy;
+  boolean need_buffer;
  int h_in_group, v_in_group, h_out_group, v_out_group;

  upsample = (my_upsample_ptr)
@@ -415,11 +310,6 @@ jinit_upsampler (j_decompress_ptr cinfo)
  if (cinfo->CCIR601_sampling)	/* this isn't supported */
    ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);

-  /* jdmainct.c doesn't support context rows when min_DCT_scaled_size = 1,
-   * so don't ask for it.
-   */
-  do_fancy = cinfo->do_fancy_upsampling && cinfo->min_DCT_scaled_size > 1;
-
  /* Verify we can handle the sampling factors, select per-component methods,
   * and create storage as needed.
   */
@@ -428,10 +318,10 @@ jinit_upsampler (j_decompress_ptr cinfo)
    /* Compute size of an "input group" after IDCT scaling.  This many samples
     * are to be converted to max_h_samp_factor * max_v_samp_factor pixels.
     */
-    h_in_group = (compptr->h_samp_factor * compptr->DCT_scaled_size) /
-         cinfo->min_DCT_scaled_size;
-    v_in_group = (compptr->v_samp_factor * compptr->DCT_scaled_size) /
-         cinfo->min_DCT_scaled_size;
+    h_in_group = (compptr->h_samp_factor * compptr->DCT_h_scaled_size) /
+                 cinfo->min_DCT_h_scaled_size;
+    v_in_group = (compptr->v_samp_factor * compptr->DCT_v_scaled_size) /
+                 cinfo->min_DCT_v_scaled_size;
    h_out_group = cinfo->max_h_samp_factor;
    v_out_group = cinfo->max_v_samp_factor;
    upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
@@ -446,18 +336,11 @@ jinit_upsampler (j_decompress_ptr cinfo)
      need_buffer = FALSE;
    } else if (h_in_group * 2 == h_out_group &&
               v_in_group == v_out_group) {
-      /* Special cases for 2h1v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2)
-    upsample->methods[ci] = h2v1_fancy_upsample;
-      else
+      /* Special case for 2h1v upsampling */
      upsample->methods[ci] = h2v1_upsample;
    } else if (h_in_group * 2 == h_out_group &&
               v_in_group * 2 == v_out_group) {
-      /* Special cases for 2h2v upsampling */
-      if (do_fancy && compptr->downsampled_width > 2) {
-    upsample->methods[ci] = h2v2_fancy_upsample;
-    upsample->pub.need_context_rows = TRUE;
-      } else
+      /* Special case for 2h2v upsampling */
      upsample->methods[ci] = h2v2_upsample;
    } else if ((h_out_group % h_in_group) == 0 &&
               (v_out_group % v_in_group) == 0) {
--- a/3rdparty/libjpeg/jdtrans.c
+++ b/3rdparty/libjpeg/jdtrans.c
@@ -2,6 +2,7 @@
 * jdtrans.c
 *
 * Copyright (C) 1995-1997, Thomas G. Lane.
+ * Modified 2000-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -99,17 +100,13 @@ transdecode_master_selection (j_decompress_ptr cinfo)
  /* This is effectively a buffered-image operation. */
  cinfo->buffered_image = TRUE;

+  /* Compute output image dimensions and related values. */
+  jpeg_core_output_dimensions(cinfo);
+
  /* Entropy decoding: either Huffman or arithmetic coding. */
-  if (cinfo->arith_code) {
-    ERREXIT(cinfo, JERR_ARITH_NOTIMPL);
-  } else {
-    if (cinfo->progressive_mode) {
-#ifdef D_PROGRESSIVE_SUPPORTED
-      jinit_phuff_decoder(cinfo);
-#else
-      ERREXIT(cinfo, JERR_NOT_COMPILED);
-#endif
-    } else
+  if (cinfo->arith_code)
+    jinit_arith_decoder(cinfo);
+  else {
    jinit_huff_decoder(cinfo);
  }

--- a/3rdparty/libjpeg/jerror.c
+++ b/3rdparty/libjpeg/jerror.c
@@ -2,6 +2,7 @@
 * jerror.c
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -23,7 +24,6 @@
 #include "jpeglib.h"
 #include "jversion.h"
 #include "jerror.h"
-#include <stdlib.h>

 #ifdef USE_WINDOWS_MESSAGEBOX
 #include <windows.h>
@@ -67,7 +67,7 @@ const char * const jpeg_std_message_table[] = {
 * or jpeg_destroy) at some point.
 */

-METHODDEF(void)
+METHODDEF(noreturn_t)
 error_exit (j_common_ptr cinfo)
 {
  /* Always display the message */
--- a/3rdparty/libjpeg/jerror.h
+++ b/3rdparty/libjpeg/jerror.h
@@ -2,6 +2,7 @@
 * jerror.h
 *
 * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 1997-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -39,15 +40,15 @@ typedef enum {
 JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */

 /* For maintenance convenience, list is alphabetical by message code name */
-JMESSAGE(JERR_ARITH_NOTIMPL,
-     "Sorry, there are legal restrictions on arithmetic coding")
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
 JMESSAGE(JERR_BAD_BUFFER_MODE, "Bogus buffer control mode")
 JMESSAGE(JERR_BAD_COMPONENT_ID, "Invalid component ID %d in SOS")
 JMESSAGE(JERR_BAD_CROP_SPEC, "Invalid crop request")
 JMESSAGE(JERR_BAD_DCT_COEF, "DCT coefficient out of range")
-JMESSAGE(JERR_BAD_DCTSIZE, "IDCT output block size %d not supported")
+JMESSAGE(JERR_BAD_DCTSIZE, "DCT scaled block size %dx%d not supported")
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
 JMESSAGE(JERR_BAD_HUFF_TABLE, "Bogus Huffman table definition")
 JMESSAGE(JERR_BAD_IN_COLORSPACE, "Bogus input colorspace")
 JMESSAGE(JERR_BAD_J_COLORSPACE, "Bogus JPEG colorspace")
@@ -94,6 +95,7 @@ JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
 JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
+JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JERR_NO_BACKING_STORE, "Backing store not supported")
 JMESSAGE(JERR_NO_HUFF_TABLE, "Huffman table 0x%02x was not defined")
 JMESSAGE(JERR_NO_IMAGE, "JPEG datastream contains no image")
@@ -104,11 +106,11 @@ JMESSAGE(JERR_QUANT_COMPONENTS,
         "Cannot quantize more than %d color components")
 JMESSAGE(JERR_QUANT_FEW_COLORS, "Cannot quantize to fewer than %d colors")
 JMESSAGE(JERR_QUANT_MANY_COLORS, "Cannot quantize to more than %d colors")
+JMESSAGE(JERR_SOF_BEFORE, "Invalid JPEG file structure: %s before SOF")
 JMESSAGE(JERR_SOF_DUPLICATE, "Invalid JPEG file structure: two SOF markers")
 JMESSAGE(JERR_SOF_NO_SOS, "Invalid JPEG file structure: missing SOS marker")
 JMESSAGE(JERR_SOF_UNSUPPORTED, "Unsupported JPEG process: SOF type 0x%02x")
 JMESSAGE(JERR_SOI_DUPLICATE, "Invalid JPEG file structure: two SOI markers")
-JMESSAGE(JERR_SOS_NO_SOF, "Invalid JPEG file structure: SOS before SOF")
 JMESSAGE(JERR_TFILE_CREATE, "Failed to create temporary file %s")
 JMESSAGE(JERR_TFILE_READ, "Read failed on temporary file")
 JMESSAGE(JERR_TFILE_SEEK, "Seek failed on temporary file")
@@ -171,6 +173,7 @@ JMESSAGE(JTRC_UNKNOWN_IDS,
 JMESSAGE(JTRC_XMS_CLOSE, "Freed XMS handle %u")
 JMESSAGE(JTRC_XMS_OPEN, "Obtained XMS handle %u")
 JMESSAGE(JWRN_ADOBE_XFORM, "Unknown Adobe color transform code %d")
+JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 JMESSAGE(JWRN_BOGUS_PROGRESSION,
         "Inconsistent progression sequence for component %d coefficient %d")
 JMESSAGE(JWRN_EXTRANEOUS_DATA,
@@ -228,6 +231,15 @@ JMESSAGE(JWRN_TOO_MUCH_DATA, "Application transferred too many scanlines")
   (cinfo)->err->msg_parm.i[2] = (p3), \
   (cinfo)->err->msg_parm.i[3] = (p4), \
   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+#define ERREXIT6(cinfo,code,p1,p2,p3,p4,p5,p6)  \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
 #define ERREXITS(cinfo,code,str)  \
  ((cinfo)->err->msg_code = (code), \
   strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
--- a/3rdparty/libjpeg/jfdctflt.c
+++ b/3rdparty/libjpeg/jfdctflt.c
@@ -2,6 +2,7 @@
 * jfdctflt.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -56,26 +57,30 @@
 */

 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT * data)
+jpeg_fdct_float (FAST_FLOAT * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
  FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
  FAST_FLOAT z1, z2, z3, z4, z5, z11, z13;
  FAST_FLOAT *dataptr;
+  JSAMPROW elemptr;
  int ctr;

  /* Pass 1: process rows. */

  dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]));
+    tmp7 = (FAST_FLOAT) (GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]));
+    tmp1 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]));
+    tmp6 = (FAST_FLOAT) (GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]));
+    tmp2 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]));
+    tmp5 = (FAST_FLOAT) (GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]));
+    tmp3 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]));
+    tmp4 = (FAST_FLOAT) (GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]));

    /* Even part */

@@ -84,7 +89,8 @@ jpeg_fdct_float (FAST_FLOAT * data)
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
    dataptr[4] = tmp10 - tmp11;

    z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
--- a/3rdparty/libjpeg/jfdctfst.c
+++ b/3rdparty/libjpeg/jfdctfst.c
@@ -2,6 +2,7 @@
 * jfdctfst.c
 *
 * Copyright (C) 1994-1996, Thomas G. Lane.
+ * Modified 2003-2009 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -111,27 +112,31 @@
 */

 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM * data)
+jpeg_fdct_ifast (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
 {
  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  DCTELEM tmp10, tmp11, tmp12, tmp13;
  DCTELEM z1, z2, z3, z4, z5, z11, z13;
  DCTELEM *dataptr;
+  JSAMPROW elemptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */

  dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[0] + dataptr[7];
-    tmp7 = dataptr[0] - dataptr[7];
-    tmp1 = dataptr[1] + dataptr[6];
-    tmp6 = dataptr[1] - dataptr[6];
-    tmp2 = dataptr[2] + dataptr[5];
-    tmp5 = dataptr[2] - dataptr[5];
-    tmp3 = dataptr[3] + dataptr[4];
-    tmp4 = dataptr[3] - dataptr[4];
+  for (ctr = 0; ctr < DCTSIZE; ctr++) {
+    elemptr = sample_data[ctr] + start_col;
+
+    /* Load data into workspace */
+    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
+    tmp7 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
+    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
+    tmp6 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
+    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
+    tmp5 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
+    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
+    tmp4 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);

    /* Even part */

@@ -140,7 +145,8 @@ jpeg_fdct_ifast (DCTELEM * data)
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

-    dataptr[0] = tmp10 + tmp11; /* phase 3 */
+    /* Apply unsigned->signed conversion */
+    dataptr[0] = tmp10 + tmp11 - 8 * CENTERJSAMPLE; /* phase 3 */
    dataptr[4] = tmp10 - tmp11;

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
--- a/3rdparty/libjpeg/jfdctint.c
+++ b/3rdparty/libjpeg/jfdctint.c
--- a/3rdparty/libjpeg/jidctflt.c
+++ b/3rdparty/libjpeg/jidctflt.c
@@ -2,6 +2,7 @@
 * jidctflt.c
 *
 * Copyright (C) 1994-1998, Thomas G. Lane.
+ * Modified 2010 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -76,10 +77,9 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  FLOAT_MULT_TYPE * quantptr;
  FAST_FLOAT * wsptr;
  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
+  JSAMPLE *range_limit = cinfo->sample_range_limit;
  int ctr;
  FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  SHIFT_TEMPS

  /* Pass 1: process columns from input, store into work array. */

@@ -152,12 +152,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */

    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */

    tmp6 = tmp12 - tmp7;	/* phase 2 */
    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;

    wsptr[DCTSIZE*0] = tmp0 + tmp7;
    wsptr[DCTSIZE*7] = tmp0 - tmp7;
@@ -165,8 +165,8 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    wsptr[DCTSIZE*6] = tmp1 - tmp6;
    wsptr[DCTSIZE*2] = tmp2 + tmp5;
    wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*4] = tmp3 + tmp4;
-    wsptr[DCTSIZE*3] = tmp3 - tmp4;
+    wsptr[DCTSIZE*3] = tmp3 + tmp4;
+    wsptr[DCTSIZE*4] = tmp3 - tmp4;

    inptr++;			/* advance pointers to next column */
    quantptr++;
@@ -174,7 +174,6 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  }

  /* Pass 2: process rows from work array, store into output array. */
-  /* Note that we must descale the results by a factor of 8 == 2**3. */

  wsptr = workspace;
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
@@ -187,8 +186,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,

    /* Even part */

-    tmp10 = wsptr[0] + wsptr[4];
-    tmp11 = wsptr[0] - wsptr[4];
+    /* Apply signed->unsigned and prepare float->int conversion */
+    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    tmp10 = z5 + wsptr[4];
+    tmp11 = z5 - wsptr[4];

    tmp13 = wsptr[2] + wsptr[6];
    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
@@ -209,31 +210,23 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);

    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */
-    tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */
+    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */

    tmp6 = tmp12 - tmp7;
    tmp5 = tmp11 - tmp6;
-    tmp4 = tmp10 + tmp5;
+    tmp4 = tmp10 - tmp5;

-    /* Final output stage: scale down by a factor of 8 and range-limit */
+    /* Final output stage: float->int conversion and range-limit */

-    outptr[0] = range_limit[(int) DESCALE((INT32) (tmp0 + tmp7), 3)
-                & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE((INT32) (tmp0 - tmp7), 3)
-                & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE((INT32) (tmp1 + tmp6), 3)
-                & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE((INT32) (tmp1 - tmp6), 3)
-                & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE((INT32) (tmp2 + tmp5), 3)
-                & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE((INT32) (tmp2 - tmp5), 3)
-                & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE((INT32) (tmp3 + tmp4), 3)
-                & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE((INT32) (tmp3 - tmp4), 3)
-                & RANGE_MASK];
+    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];

    wsptr += DCTSIZE;		/* advance pointer to next row */
  }
--- a/3rdparty/libjpeg/jidctint.c
+++ b/3rdparty/libjpeg/jidctint.c
--- a/3rdparty/libjpeg/jidctred.c
+++ b/3rdparty/libjpeg/jidctred.c
@@ -1,398 +0,0 @@
-/*
- * jidctred.c
- *
- * Copyright (C) 1994-1998, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains inverse-DCT routines that produce reduced-size output:
- * either 4x4, 2x2, or 1x1 pixels from an 8x8 DCT block.
- *
- * The implementation is based on the Loeffler, Ligtenberg and Moschytz (LL&M)
- * algorithm used in jidctint.c.  We simply replace each 8-to-8 1-D IDCT step
- * with an 8-to-4 step that produces the four averages of two adjacent outputs
- * (or an 8-to-2 step producing two averages of four outputs, for 2x2 output).
- * These steps were derived by computing the corresponding values at the end
- * of the normal LL&M code, then simplifying as much as possible.
- *
- * 1x1 is trivial: just take the DC coefficient divided by 8.
- *
- * See jidctint.c for additional comments.
- */
-
-#define JPEG_INTERNALS
-#include "jinclude.h"
-#include "jpeglib.h"
-#include "jdct.h"		/* Private declarations for DCT subsystem */
-
-#ifdef IDCT_SCALING_SUPPORTED
-
-
-/*
- * This module is specialized to the case DCTSIZE = 8.
- */
-
-#if DCTSIZE != 8
-  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
-#endif
-
-
-/* Scaling is the same as in jidctint.c. */
-
-#if BITS_IN_JSAMPLE == 8
-#define CONST_BITS  13
-#define PASS1_BITS  2
-#else
-#define CONST_BITS  13
-#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
-#endif
-
-/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
- * causing a lot of useless floating-point operations at run time.
- * To get around this we use the following pre-calculated constants.
- * If you change CONST_BITS you may want to add appropriate values.
- * (With a reasonable C compiler, you can just rely on the FIX() macro...)
- */
-
-#if CONST_BITS == 13
-#define FIX_0_211164243  ((INT32)  1730)	/* FIX(0.211164243) */
-#define FIX_0_509795579  ((INT32)  4176)	/* FIX(0.509795579) */
-#define FIX_0_601344887  ((INT32)  4926)	/* FIX(0.601344887) */
-#define FIX_0_720959822  ((INT32)  5906)	/* FIX(0.720959822) */
-#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
-#define FIX_0_850430095  ((INT32)  6967)	/* FIX(0.850430095) */
-#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
-#define FIX_1_061594337  ((INT32)  8697)	/* FIX(1.061594337) */
-#define FIX_1_272758580  ((INT32)  10426)	/* FIX(1.272758580) */
-#define FIX_1_451774981  ((INT32)  11893)	/* FIX(1.451774981) */
-#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
-#define FIX_2_172734803  ((INT32)  17799)	/* FIX(2.172734803) */
-#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
-#define FIX_3_624509785  ((INT32)  29692)	/* FIX(3.624509785) */
-#else
-#define FIX_0_211164243  FIX(0.211164243)
-#define FIX_0_509795579  FIX(0.509795579)
-#define FIX_0_601344887  FIX(0.601344887)
-#define FIX_0_720959822  FIX(0.720959822)
-#define FIX_0_765366865  FIX(0.765366865)
-#define FIX_0_850430095  FIX(0.850430095)
-#define FIX_0_899976223  FIX(0.899976223)
-#define FIX_1_061594337  FIX(1.061594337)
-#define FIX_1_272758580  FIX(1.272758580)
-#define FIX_1_451774981  FIX(1.451774981)
-#define FIX_1_847759065  FIX(1.847759065)
-#define FIX_2_172734803  FIX(2.172734803)
-#define FIX_2_562915447  FIX(2.562915447)
-#define FIX_3_624509785  FIX(3.624509785)
-#endif
-
-
-/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
- * For 8-bit samples with the recommended scaling, all the variable
- * and constant values involved are no more than 16 bits wide, so a
- * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
- * For 12-bit samples, a full 32-bit multiplication will be needed.
- */
-
-#if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
-#else
-#define MULTIPLY(var,const)  ((var) * (const))
-#endif
-
-
-/* Dequantize a coefficient by multiplying it by the multiplier-table
- * entry; produce an int result.  In this module, both inputs and result
- * are 16 bits or less, so either int or short multiply will work.
- */
-
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 4x4 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-           JCOEFPTR coef_block,
-           JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp2, tmp10, tmp12;
-  INT32 z1, z2, z3, z4;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*4];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process column 4, because second pass won't use it */
-    if (ctr == DCTSIZE-4)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-    inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-    inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-
-      continue;
-    }
-
-    /* Even part */
-
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 <<= (CONST_BITS+1);
-
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
-
-    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
-
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-
-    /* Odd part */
-
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-     + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-     + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-     + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-     + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-     + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-     + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
-  }
-
-  /* Pass 2: process 4 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 4; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
-    wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-                  & RANGE_MASK];
-
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-      outptr[2] = dcval;
-      outptr[3] = dcval;
-
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-
-    /* Even part */
-
-    tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1);
-
-    tmp2 = MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
-     + MULTIPLY((INT32) wsptr[6], - FIX_0_765366865);
-
-    tmp10 = tmp0 + tmp2;
-    tmp12 = tmp0 - tmp2;
-
-    /* Odd part */
-
-    z1 = (INT32) wsptr[7];
-    z2 = (INT32) wsptr[5];
-    z3 = (INT32) wsptr[3];
-    z4 = (INT32) wsptr[1];
-
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-     + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-     + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-     + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
-
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-     + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-     + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-     + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
-
-    /* Final output stage */
-
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-                      CONST_BITS+PASS1_BITS+3+1)
-                & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-                      CONST_BITS+PASS1_BITS+3+1)
-                & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-                      CONST_BITS+PASS1_BITS+3+1)
-                & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-                      CONST_BITS+PASS1_BITS+3+1)
-                & RANGE_MASK];
-
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 2x2 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-           JCOEFPTR coef_block,
-           JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  INT32 tmp0, tmp10, z1;
-  JCOEFPTR inptr;
-  ISLOW_MULT_TYPE * quantptr;
-  int * wsptr;
-  JSAMPROW outptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  int ctr;
-  int workspace[DCTSIZE*2];	/* buffers data between passes */
-  SHIFT_TEMPS
-
-  /* Pass 1: process columns from input, store into work array. */
-
-  inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  wsptr = workspace;
-  for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
-    /* Don't bother to process columns 2,4,6 */
-    if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
-      continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-    inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
-      /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
-
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-
-      continue;
-    }
-
-    /* Even part */
-
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = z1 << (CONST_BITS+2);
-
-    /* Odd part */
-
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
-  }
-
-  /* Pass 2: process 2 rows from work array, store into output array. */
-
-  wsptr = workspace;
-  for (ctr = 0; ctr < 2; ctr++) {
-    outptr = output_buf[ctr] + output_col;
-    /* It's not clear whether a zero row test is worthwhile here ... */
-
-#ifndef NO_ZERO_ROW_TEST
-    if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
-      /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
-                  & RANGE_MASK];
-
-      outptr[0] = dcval;
-      outptr[1] = dcval;
-
-      wsptr += DCTSIZE;		/* advance pointer to next row */
-      continue;
-    }
-#endif
-
-    /* Even part */
-
-    tmp10 = ((INT32) wsptr[0]) << (CONST_BITS+2);
-
-    /* Odd part */
-
-    tmp0 = MULTIPLY((INT32) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-     + MULTIPLY((INT32) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-     + MULTIPLY((INT32) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-     + MULTIPLY((INT32) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
-
-    /* Final output stage */
-
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-                      CONST_BITS+PASS1_BITS+3+2)
-                & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-                      CONST_BITS+PASS1_BITS+3+2)
-                & RANGE_MASK];
-
-    wsptr += DCTSIZE;		/* advance pointer to next row */
-  }
-}
-
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a reduced-size 1x1 output block.
- */
-
-GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-           JCOEFPTR coef_block,
-           JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  int dcval;
-  ISLOW_MULT_TYPE * quantptr;
-  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
-  SHIFT_TEMPS
-
-  /* We hardly need an inverse DCT routine for this: just take the
-   * average pixel value, which is one-eighth of the DC coefficient.
-   */
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
-  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((INT32) dcval, 3);
-
-  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
-}
-
-#endif /* IDCT_SCALING_SUPPORTED */
--- a/3rdparty/libjpeg/jmemmgr.c
+++ b/3rdparty/libjpeg/jmemmgr.c
@@ -2,6 +2,7 @@
 * jmemmgr.c
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2011-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -36,9 +37,6 @@ extern char * getenv JPP((const char * name));
 #endif
 #endif

-#if defined _MSC_VER && _MSC_VER >= 1400
-#pragma warning(disable: 4267)
-#endif

 /*
 * Some important notes:
@@ -216,7 +214,7 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 #endif /* MEM_STATS */


-LOCAL(void)
+LOCAL(noreturn_t)
 out_of_memory (j_common_ptr cinfo, int which)
 /* Report an out-of-memory error and stop execution */
 /* If we compiled MEM_STATS support, report alloc requests before dying */
@@ -824,7 +822,7 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
      undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
      end_row -= ptr->cur_start_row;
      while (undef_row < end_row) {
-    jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
+        FMEMZERO((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
        undef_row++;
      }
    } else {
@@ -909,7 +907,7 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
      undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
      end_row -= ptr->cur_start_row;
      while (undef_row < end_row) {
-    jzero_far((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
+        FMEMZERO((void FAR *) ptr->mem_buffer[undef_row], bytesperrow);
        undef_row++;
      }
    } else {
--- a/3rdparty/libjpeg/jmemnobs.c
+++ b/3rdparty/libjpeg/jmemnobs.c
@@ -0,0 +1,109 @@
+/*
+ * jmemnobs.c
+ *
+ * Copyright (C) 1992-1996, Thomas G. Lane.
+ * This file is part of the Independent JPEG Group's software.
+ * For conditions of distribution and use, see the accompanying README file.
+ *
+ * This file provides a really simple implementation of the system-
+ * dependent portion of the JPEG memory manager.  This implementation
+ * assumes that no backing-store files are needed: all required space
+ * can be obtained from malloc().
+ * This is very portable in the sense that it'll compile on almost anything,
+ * but you'd better have lots of main memory (or virtual memory) if you want
+ * to process big images.
+ * Note that the max_memory_to_use option is ignored by this implementation.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jmemsys.h"		/* import the system-dependent declarations */
+
+#ifndef HAVE_STDLIB_H		/* <stdlib.h> should declare malloc(),free() */
+extern void * malloc JPP((size_t size));
+extern void free JPP((void *ptr));
+#endif
+
+
+/*
+ * Memory allocation and freeing are controlled by the regular library
+ * routines malloc() and free().
+ */
+
+GLOBAL(void *)
+jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void *) malloc(sizeofobject);
+}
+
+GLOBAL(void)
+jpeg_free_small (j_common_ptr cinfo, void * object, size_t sizeofobject)
+{
+  free(object);
+}
+
+
+/*
+ * "Large" objects are treated the same as "small" ones.
+ * NB: although we include FAR keywords in the routine declarations,
+ * this file won't actually work in 80x86 small/medium model; at least,
+ * you probably won't be able to process useful-size images in only 64KB.
+ */
+
+GLOBAL(void FAR *)
+jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+{
+  return (void FAR *) malloc(sizeofobject);
+}
+
+GLOBAL(void)
+jpeg_free_large (j_common_ptr cinfo, void FAR * object, size_t sizeofobject)
+{
+  free(object);
+}
+
+
+/*
+ * This routine computes the total memory space available for allocation.
+ * Here we always say, "we got all you want bud!"
+ */
+
+GLOBAL(long)
+jpeg_mem_available (j_common_ptr cinfo, long min_bytes_needed,
+                    long max_bytes_needed, long already_allocated)
+{
+  return max_bytes_needed;
+}
+
+
+/*
+ * Backing store (temporary file) management.
+ * Since jpeg_mem_available always promised the moon,
+ * this should never be called and we can just error out.
+ */
+
+GLOBAL(void)
+jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
+                         long total_bytes_needed)
+{
+  ERREXIT(cinfo, JERR_NO_BACKING_STORE);
+}
+
+
+/*
+ * These routines take care of any system-dependent initialization and
+ * cleanup required.  Here, there isn't any.
+ */
+
+GLOBAL(long)
+jpeg_mem_init (j_common_ptr cinfo)
+{
+  return 0;			/* just set max_memory_to_use to 0 */
+}
+
+GLOBAL(void)
+jpeg_mem_term (j_common_ptr cinfo)
+{
+  /* no work */
+}
--- a/3rdparty/libjpeg/jmorecfg.h
+++ b/3rdparty/libjpeg/jmorecfg.h
@@ -2,6 +2,7 @@
 * jmorecfg.h
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -158,8 +159,14 @@ typedef short INT16;
 /* INT32 must hold at least signed 32-bit values. */

 #ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
+#ifndef _BASETSD_H_		/* Microsoft defines it in basetsd.h */
+#ifndef _BASETSD_H		/* MinGW is slightly different */
+#ifndef QGLOBAL_H		/* Qt defines it in qglobal.h */
 typedef long INT32;
 #endif
+#endif
+#endif
+#endif

 /* Datatype used for image dimensions.  The JPEG standard only supports
 * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
@@ -203,17 +210,39 @@ typedef unsigned int JDIMENSION;
 #endif


+/* The noreturn type identifier is used to declare functions
+ * which cannot return.
+ * Compilers can thus create more optimized code and perform
+ * better checks for warnings and errors.
+ * Static analyzer tools can make improved inferences about
+ * execution paths and are prevented from giving false alerts.
+ *
+ * Unfortunately, the proposed specifications of corresponding
+ * extensions in the Dec 2011 ISO C standard revision (C11),
+ * GCC, MSVC, etc. are not viable.
+ * Thus we introduce a user defined type to declare noreturn
+ * functions at least for clarity.  A proper compiler would
+ * have a suitable noreturn type to match in place of void.
+ */
+
+#ifndef HAVE_NORETURN_T
+typedef void noreturn_t;
+#endif
+
+
 /* Here is the pseudo-keyword for declaring pointers that must be "far"
 * on 80x86 machines.  Most of the specialized coding for 80x86 is handled
 * by just saying "FAR *" where such a pointer is needed.  In a few places
 * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
 */

+#ifndef FAR
 #ifdef NEED_FAR_POINTERS
 #define FAR  far
 #else
 #define FAR
 #endif
+#endif


 /*
@@ -223,15 +252,16 @@ typedef unsigned int JDIMENSION;
 * Defining HAVE_BOOLEAN before including jpeglib.h should make it work.
 */

-#ifndef HAVE_BOOLEAN
-typedef int boolean;
-#endif
+#ifdef HAVE_BOOLEAN
 #ifndef FALSE			/* in case these macros already exist */
 #define FALSE	0		/* values of boolean */
 #endif
 #ifndef TRUE
 #define TRUE	1
 #endif
+#else
+typedef enum { FALSE = 0, TRUE = 1 } boolean;
+#endif


 /*
@@ -256,8 +286,6 @@ typedef int boolean;
 * (You may HAVE to do that if your compiler doesn't like null source files.)
 */

-/* Arithmetic coding is unsupported for legal reasons.  Complaints to IBM. */
-
 /* Capability options common to encoder and decoder: */

 #define DCT_ISLOW_SUPPORTED	/* slow but accurate integer algorithm */
@@ -266,9 +294,10 @@ typedef int boolean;

 /* Encoder capability options: */

-#undef  C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define C_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define C_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define C_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define DCT_SCALING_SUPPORTED	    /* Input rescaling via DCT? (Requires DCT_ISLOW)*/
 #define ENTROPY_OPT_SUPPORTED	    /* Optimization of entropy coding parms? */
 /* Note: if you selected 12-bit data precision, it is dangerous to turn off
 * ENTROPY_OPT_SUPPORTED.  The standard Huffman tables are only good for 8-bit
@@ -282,12 +311,12 @@ typedef int boolean;

 /* Decoder capability options: */

-#undef  D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
+#define D_ARITH_CODING_SUPPORTED    /* Arithmetic coding back end? */
 #define D_MULTISCAN_FILES_SUPPORTED /* Multiple-scan JPEG files? */
 #define D_PROGRESSIVE_SUPPORTED	    /* Progressive JPEG? (Requires MULTISCAN)*/
+#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #define SAVE_MARKERS_SUPPORTED	    /* jpeg_save_markers() needed? */
 #define BLOCK_SMOOTHING_SUPPORTED   /* Block smoothing? (Progressive only) */
-#define IDCT_SCALING_SUPPORTED	    /* Output rescaling via IDCT? */
 #undef  UPSAMPLE_SCALING_SUPPORTED  /* Output rescaling at upsample stage? */
 #define UPSAMPLE_MERGING_SUPPORTED  /* Fast path for sloppy upsampling? */
 #define QUANT_1PASS_SUPPORTED	    /* 1-pass color quantization? */
@@ -304,9 +333,7 @@ typedef int boolean;
 * the offsets will also change the order in which colormap data is organized.
 * RESTRICTIONS:
 * 1. The sample applications cjpeg,djpeg do NOT support modified RGB formats.
- * 2. These macros only affect RGB<=>YCbCr color conversion, so they are not
- *    useful if you are using JPEG color spaces other than YCbCr or grayscale.
- * 3. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
+ * 2. The color quantizer modules will not behave desirably if RGB_PIXELSIZE
 *    is not 3 (they don't understand about dummy color components!).  So you
 *    can't use color quantization if you change that value.
 */
--- a/3rdparty/libjpeg/jpegint.h
+++ b/3rdparty/libjpeg/jpegint.h
@@ -2,6 +2,7 @@
 * jpegint.h
 *
 * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 1997-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -99,14 +100,16 @@ struct jpeg_downsampler {
 };

 /* Forward DCT (also controls coefficient quantization) */
-struct jpeg_forward_dct {
-  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
-  /* perhaps this should be an array??? */
-  JMETHOD(void, forward_DCT, (j_compress_ptr cinfo,
-                  jpeg_component_info * compptr,
+typedef JMETHOD(void, forward_DCT_ptr,
+                (j_compress_ptr cinfo, jpeg_component_info * compptr,
                 JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
                 JDIMENSION start_row, JDIMENSION start_col,
                 JDIMENSION num_blocks));
+
+struct jpeg_forward_dct {
+  JMETHOD(void, start_pass, (j_compress_ptr cinfo));
+  /* It is useful to allow each component to have a separate FDCT method. */
+  forward_DCT_ptr forward_DCT[MAX_COMPONENTS];
 };

 /* Entropy encoding */
@@ -210,10 +213,6 @@ struct jpeg_entropy_decoder {
  JMETHOD(void, start_pass, (j_decompress_ptr cinfo));
  JMETHOD(boolean, decode_mcu, (j_decompress_ptr cinfo,
                                JBLOCKROW *MCU_data));
-
-  /* This is here to share code between baseline and progressive decoders; */
-  /* other modules probably should not use it */
-  boolean insufficient_data;	/* set TRUE after emitting warning */
 };

 /* Inverse DCT (also performs dequantization) */
@@ -303,7 +302,7 @@ struct jpeg_color_quantizer {
 #define jinit_downsampler	jIDownsampler
 #define jinit_forward_dct	jIFDCT
 #define jinit_huff_encoder	jIHEncoder
-#define jinit_phuff_encoder	jIPHEncoder
+#define jinit_arith_encoder	jIAEncoder
 #define jinit_marker_writer	jIMWriter
 #define jinit_master_decompress	jIDMaster
 #define jinit_d_main_controller	jIDMainC
@@ -312,7 +311,7 @@ struct jpeg_color_quantizer {
 #define jinit_input_controller	jIInCtlr
 #define jinit_marker_reader	jIMReader
 #define jinit_huff_decoder	jIHDecoder
-#define jinit_phuff_decoder	jIPHDecoder
+#define jinit_arith_decoder	jIADecoder
 #define jinit_inverse_dct	jIIDCT
 #define jinit_upsampler		jIUpsampler
 #define jinit_color_deconverter	jIDColor
@@ -322,14 +321,41 @@ struct jpeg_color_quantizer {
 #define jinit_memory_mgr	jIMemMgr
 #define jdiv_round_up		jDivRound
 #define jround_up		jRound
+#define jzero_far		jZeroFar
 #define jcopy_sample_rows	jCopySamples
 #define jcopy_block_row		jCopyBlocks
-#define jzero_far		jZeroFar
 #define jpeg_zigzag_order	jZIGTable
 #define jpeg_natural_order	jZAGTable
+#define jpeg_natural_order7	jZAG7Table
+#define jpeg_natural_order6	jZAG6Table
+#define jpeg_natural_order5	jZAG5Table
+#define jpeg_natural_order4	jZAG4Table
+#define jpeg_natural_order3	jZAG3Table
+#define jpeg_natural_order2	jZAG2Table
+#define jpeg_aritab		jAriTab
 #endif /* NEED_SHORT_EXTERNAL_NAMES */


+/* On normal machines we can apply MEMCOPY() and MEMZERO() to sample arrays
+ * and coefficient-block arrays.  This won't work on 80x86 because the arrays
+ * are FAR and we're assuming a small-pointer memory model.  However, some
+ * DOS compilers provide far-pointer versions of memcpy() and memset() even
+ * in the small-model libraries.  These will be used if USE_FMEM is defined.
+ * Otherwise, the routines in jutils.c do it the hard way.
+ */
+
+#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macro */
+#define FMEMZERO(target,size)	MEMZERO(target,size)
+#else				/* 80x86 case */
+#ifdef USE_FMEM
+#define FMEMZERO(target,size)	_fmemset((void FAR *)(target), 0, (size_t)(size))
+#else
+EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
+#define FMEMZERO(target,size)	jzero_far(target, size)
+#endif
+#endif
+
+
 /* Compression module initialization routines */
 EXTERN(void) jinit_compress_master JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_c_master_control JPP((j_compress_ptr cinfo,
@@ -344,7 +370,7 @@ EXTERN(void) jinit_color_converter JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_downsampler JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_forward_dct JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_huff_encoder JPP((j_compress_ptr cinfo));
-EXTERN(void) jinit_phuff_encoder JPP((j_compress_ptr cinfo));
+EXTERN(void) jinit_arith_encoder JPP((j_compress_ptr cinfo));
 EXTERN(void) jinit_marker_writer JPP((j_compress_ptr cinfo));
 /* Decompression module initialization routines */
 EXTERN(void) jinit_master_decompress JPP((j_decompress_ptr cinfo));
@@ -357,7 +383,7 @@ EXTERN(void) jinit_d_post_controller JPP((j_decompress_ptr cinfo,
 EXTERN(void) jinit_input_controller JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_marker_reader JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_huff_decoder JPP((j_decompress_ptr cinfo));
-EXTERN(void) jinit_phuff_decoder JPP((j_decompress_ptr cinfo));
+EXTERN(void) jinit_arith_decoder JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_inverse_dct JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_upsampler JPP((j_decompress_ptr cinfo));
 EXTERN(void) jinit_color_deconverter JPP((j_decompress_ptr cinfo));
@@ -375,12 +401,20 @@ EXTERN(void) jcopy_sample_rows JPP((JSAMPARRAY input_array, int source_row,
                                    int num_rows, JDIMENSION num_cols));
 EXTERN(void) jcopy_block_row JPP((JBLOCKROW input_row, JBLOCKROW output_row,
                                  JDIMENSION num_blocks));
-EXTERN(void) jzero_far JPP((void FAR * target, size_t bytestozero));
 /* Constant tables in jutils.c */
 #if 0				/* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
 #endif
 extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
+extern const int jpeg_natural_order7[]; /* zz to natural order for 7x7 block */
+extern const int jpeg_natural_order6[]; /* zz to natural order for 6x6 block */
+extern const int jpeg_natural_order5[]; /* zz to natural order for 5x5 block */
+extern const int jpeg_natural_order4[]; /* zz to natural order for 4x4 block */
+extern const int jpeg_natural_order3[]; /* zz to natural order for 3x3 block */
+extern const int jpeg_natural_order2[]; /* zz to natural order for 2x2 block */
+
+/* Arithmetic coding probability estimation tables in jaricom.c */
+extern const INT32 jpeg_aritab[];

 /* Suppress undefined-structure complaints if necessary. */

--- a/3rdparty/libjpeg/jpeglib.h
+++ b/3rdparty/libjpeg/jpeglib.h
@@ -2,6 +2,7 @@
 * jpeglib.h
 *
 * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Modified 2002-2012 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -13,17 +14,6 @@
 #ifndef JPEGLIB_H
 #define JPEGLIB_H

-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* HJH modification: protect compiler options for structure alignment and enum
- * size if the compiler is Borland C++ */
-#ifdef __BORLANDC__
-#pragma option push -b
-#pragma option push -a4
-#endif
-
 /*
 * First we include the configuration files that record how this
 * installation of the JPEG library is set up.  jconfig.h can be
@@ -37,11 +27,19 @@ extern "C" {
 #include "jmorecfg.h"		/* seldom changed options */


-/* Version ID for the JPEG library.
- * Might be useful for tests like "#if JPEG_LIB_VERSION >= 60".
+#ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
+extern "C" {
+#endif
+#endif
+
+/* Version IDs for the JPEG library.
+ * Might be useful for tests like "#if JPEG_LIB_VERSION >= 90".
 */

-#define JPEG_LIB_VERSION  62	/* Version 6b */
+#define JPEG_LIB_VERSION        90	/* Compatibility version 9.0 */
+#define JPEG_LIB_VERSION_MAJOR  9
+#define JPEG_LIB_VERSION_MINOR  0


 /* Various constants determining the sizes of things.
@@ -49,7 +47,7 @@ extern "C" {
 * if you want to be compatible.
 */

-#define DCTSIZE		    8	/* The basic DCT block is 8x8 samples */
+#define DCTSIZE		    8	/* The basic DCT block is 8x8 coefficients */
 #define DCTSIZE2	    64	/* DCTSIZE squared; # of elements in a block */
 #define NUM_QUANT_TBLS      4	/* Quantization tables are numbered 0..3 */
 #define NUM_HUFF_TBLS       4	/* Huffman tables are numbered 0..3 */
@@ -149,18 +147,18 @@ typedef struct {
   */
  JDIMENSION width_in_blocks;
  JDIMENSION height_in_blocks;
-  /* Size of a DCT block in samples.  Always DCTSIZE for compression.
-   * For decompression this is the size of the output from one DCT block,
-   * reflecting any scaling we choose to apply during the IDCT step.
-   * Values of 1,2,4,8 are likely to be supported.  Note that different
-   * components may receive different IDCT scalings.
+  /* Size of a DCT block in samples,
+   * reflecting any scaling we choose to apply during the DCT step.
+   * Values from 1 to 16 are supported.
+   * Note that different components may receive different DCT scalings.
   */
-  int DCT_scaled_size;
+  int DCT_h_scaled_size;
+  int DCT_v_scaled_size;
  /* The downsampled dimensions are the component's actual, unpadded number
-   * of samples at the main buffer (preprocessing/compression interface), thus
-   * downsampled_width = ceil(image_width * Hi/Hmax)
-   * and similarly for height.  For decompression, IDCT scaling is included, so
-   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_scaled_size/DCTSIZE)
+   * of samples at the main buffer (preprocessing/compression interface);
+   * DCT scaling is included, so
+   * downsampled_width = ceil(image_width * Hi/Hmax * DCT_h_scaled_size/DCTSIZE)
+   * and similarly for height.
   */
  JDIMENSION downsampled_width;	 /* actual width in samples */
  JDIMENSION downsampled_height; /* actual height in samples */
@@ -175,7 +173,7 @@ typedef struct {
  int MCU_width;		/* number of blocks per MCU, horizontally */
  int MCU_height;		/* number of blocks per MCU, vertically */
  int MCU_blocks;		/* MCU_width * MCU_height */
-  int MCU_sample_width;		/* MCU width in samples, MCU_width*DCT_scaled_size */
+  int MCU_sample_width;	/* MCU width in samples: MCU_width * DCT_h_scaled_size */
  int last_col_width;		/* # of non-dummy blocks across in last MCU */
  int last_row_height;		/* # of non-dummy blocks down in last MCU */

@@ -223,6 +221,13 @@ typedef enum {
        JCS_YCCK		/* Y/Cb/Cr/K */
 } J_COLOR_SPACE;

+/* Supported color transforms. */
+
+typedef enum {
+        JCT_NONE           = 0,
+        JCT_SUBTRACT_GREEN = 1
+} J_COLOR_TRANSFORM;
+
 /* DCT/IDCT algorithm options. */

 typedef enum {
@@ -302,6 +307,17 @@ struct jpeg_compress_struct {
   * helper routines to simplify changing parameters.
   */

+  unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+
+  JDIMENSION jpeg_width;	/* scaled JPEG image width */
+  JDIMENSION jpeg_height;	/* scaled JPEG image height */
+  /* Dimensions of actual JPEG image that will be written to file,
+   * derived from input dimensions by scaling factors above.
+   * These fields are computed by jpeg_start_compress().
+   * You can also use jpeg_calc_jpeg_dimensions() to determine these values
+   * in advance of calling jpeg_start_compress().
+   */
+
  int data_precision;		/* bits of precision in image data */

  int num_components;		/* # of color components in JPEG image */
@@ -311,7 +327,10 @@ struct jpeg_compress_struct {
  /* comp_info[i] describes component that appears i'th in SOF */

  JQUANT_TBL * quant_tbl_ptrs[NUM_QUANT_TBLS];
-  /* ptrs to coefficient quantization tables, or NULL if not defined */
+  int q_scale_factor[NUM_QUANT_TBLS];
+  /* ptrs to coefficient quantization tables, or NULL if not defined,
+   * and corresponding scale factors (percentage, initialized 100).
+   */

  JHUFF_TBL * dc_huff_tbl_ptrs[NUM_HUFF_TBLS];
  JHUFF_TBL * ac_huff_tbl_ptrs[NUM_HUFF_TBLS];
@@ -332,6 +351,7 @@ struct jpeg_compress_struct {
  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */
  boolean optimize_coding;	/* TRUE=optimize entropy encoding parms */
  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */
+  boolean do_fancy_downsampling; /* TRUE=apply fancy downsampling */
  int smoothing_factor;		/* 1..100, or 0 for no input smoothing */
  J_DCT_METHOD dct_method;	/* DCT algorithm selector */

@@ -357,6 +377,9 @@ struct jpeg_compress_struct {
  UINT16 Y_density;		/* Vertical pixel density */
  boolean write_Adobe_marker;	/* should an Adobe marker be written? */

+  J_COLOR_TRANSFORM color_transform;
+  /* Color transform identifier, writes LSE marker if nonzero */
+
  /* State variable: index of next scanline to be written to
   * jpeg_write_scanlines().  Application may use this to control its
   * processing loop, e.g., "while (next_scanline < image_height)".
@@ -375,6 +398,9 @@ struct jpeg_compress_struct {
  int max_h_samp_factor;	/* largest h_samp_factor */
  int max_v_samp_factor;	/* largest v_samp_factor */

+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */
+
  JDIMENSION total_iMCU_rows;	/* # of iMCU rows to be input to coef ctlr */
  /* The coefficient controller receives data in units of MCU rows as defined
   * for fully interleaved scans (whether the JPEG file is interleaved or not).
@@ -400,6 +426,10 @@ struct jpeg_compress_struct {

  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */

+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order;	/* natural-order position array */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) */
+
  /*
   * Links to compression subobjects (methods and private variables of modules)
   */
@@ -546,6 +576,7 @@ struct jpeg_decompress_struct {
  jpeg_component_info * comp_info;
  /* comp_info[i] describes component that appears i'th in SOF */

+  boolean is_baseline;		/* TRUE if Baseline SOF0 encountered */
  boolean progressive_mode;	/* TRUE if SOFn specifies progressive mode */
  boolean arith_code;		/* TRUE=arithmetic coding, FALSE=Huffman */

@@ -568,6 +599,9 @@ struct jpeg_decompress_struct {
  boolean saw_Adobe_marker;	/* TRUE iff an Adobe APP14 marker was found */
  UINT8 Adobe_transform;	/* Color transform code from Adobe marker */

+  J_COLOR_TRANSFORM color_transform;
+  /* Color transform identifier derived from LSE marker, otherwise zero */
+
  boolean CCIR601_sampling;	/* TRUE=first samples are cosited */

  /* Aside from the specific data retained from APPn markers known to the
@@ -586,7 +620,8 @@ struct jpeg_decompress_struct {
  int max_h_samp_factor;	/* largest h_samp_factor */
  int max_v_samp_factor;	/* largest v_samp_factor */

-  int min_DCT_scaled_size;	/* smallest DCT_scaled_size of any component */
+  int min_DCT_h_scaled_size;	/* smallest DCT_h_scaled_size of any component */
+  int min_DCT_v_scaled_size;	/* smallest DCT_v_scaled_size of any component */

  JDIMENSION total_iMCU_rows;	/* # of iMCU rows in image */
  /* The coefficient controller's input and output progress is measured in
@@ -594,7 +629,7 @@ struct jpeg_decompress_struct {
   * in fully interleaved JPEG scans, but are used whether the scan is
   * interleaved or not.  We define an iMCU row as v_samp_factor DCT block
   * rows of each component.  Therefore, the IDCT output contains
-   * v_samp_factor*DCT_scaled_size sample rows of a component per iMCU row.
+   * v_samp_factor*DCT_v_scaled_size sample rows of a component per iMCU row.
   */

  JSAMPLE * sample_range_limit; /* table for fast range-limiting */
@@ -618,6 +653,12 @@ struct jpeg_decompress_struct {

  int Ss, Se, Ah, Al;		/* progressive JPEG parameters for scan */

+  /* These fields are derived from Se of first SOS marker.
+   */
+  int block_size;		/* the basic DCT block size: 1..16 */
+  const int * natural_order; /* natural-order position array for entropy decode */
+  int lim_Se;			/* min( Se, DCTSIZE2-1 ) for entropy decode */
+
  /* This field is shared between entropy decoder and marker parser.
   * It is either zero or the code of a JPEG marker that has been
   * read from the data source, but has not yet been processed.
@@ -653,7 +694,7 @@ struct jpeg_decompress_struct {

 struct jpeg_error_mgr {
  /* Error exit handler: does not return to caller */
-  JMETHOD(void, error_exit, (j_common_ptr cinfo));
+  JMETHOD(noreturn_t, error_exit, (j_common_ptr cinfo));
  /* Conditionally emit a trace or warning message */
  JMETHOD(void, emit_message, (j_common_ptr cinfo, int msg_level));
  /* Routine that actually outputs a trace or error message */
@@ -847,11 +888,14 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_destroy_decompress	jDestDecompress
 #define jpeg_stdio_dest		jStdDest
 #define jpeg_stdio_src		jStdSrc
+#define jpeg_mem_dest		jMemDest
+#define jpeg_mem_src		jMemSrc
 #define jpeg_set_defaults	jSetDefaults
 #define jpeg_set_colorspace	jSetColorspace
 #define jpeg_default_colorspace	jDefColorspace
 #define jpeg_set_quality	jSetQuality
 #define jpeg_set_linear_quality	jSetLQuality
+#define jpeg_default_qtables	jDefQTables
 #define jpeg_add_quant_table	jAddQuantTable
 #define jpeg_quality_scaling	jQualityScaling
 #define jpeg_simple_progression	jSimProgress
@@ -861,6 +905,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_start_compress	jStrtCompress
 #define jpeg_write_scanlines	jWrtScanlines
 #define jpeg_finish_compress	jFinCompress
+#define jpeg_calc_jpeg_dimensions	jCjpegDimensions
 #define jpeg_write_raw_data	jWrtRawData
 #define jpeg_write_marker	jWrtMarker
 #define jpeg_write_m_header	jWrtMHeader
@@ -877,6 +922,7 @@ typedef JMETHOD(boolean, jpeg_marker_parser_method, (j_decompress_ptr cinfo));
 #define jpeg_input_complete	jInComplete
 #define jpeg_new_colormap	jNewCMap
 #define jpeg_consume_input	jConsumeInput
+#define jpeg_core_output_dimensions	jCoreDimensions
 #define jpeg_calc_output_dimensions	jCalcDimensions
 #define jpeg_save_markers	jSaveMarkers
 #define jpeg_set_marker_processor	jSetMarker
@@ -921,6 +967,14 @@ EXTERN(void) jpeg_destroy_decompress JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_stdio_dest JPP((j_compress_ptr cinfo, FILE * outfile));
 EXTERN(void) jpeg_stdio_src JPP((j_decompress_ptr cinfo, FILE * infile));

+/* Data source and destination managers: memory buffers. */
+EXTERN(void) jpeg_mem_dest JPP((j_compress_ptr cinfo,
+                               unsigned char ** outbuffer,
+                               unsigned long * outsize));
+EXTERN(void) jpeg_mem_src JPP((j_decompress_ptr cinfo,
+                              unsigned char * inbuffer,
+                              unsigned long insize));
+
 /* Default parameter setup for compression */
 EXTERN(void) jpeg_set_defaults JPP((j_compress_ptr cinfo));
 /* Compression parameter setup aids */
@@ -932,6 +986,8 @@ EXTERN(void) jpeg_set_quality JPP((j_compress_ptr cinfo, int quality,
 EXTERN(void) jpeg_set_linear_quality JPP((j_compress_ptr cinfo,
                                          int scale_factor,
                                          boolean force_baseline));
+EXTERN(void) jpeg_default_qtables JPP((j_compress_ptr cinfo,
+                                       boolean force_baseline));
 EXTERN(void) jpeg_add_quant_table JPP((j_compress_ptr cinfo, int which_tbl,
                                       const unsigned int *basic_table,
                                       int scale_factor,
@@ -951,12 +1007,15 @@ EXTERN(JDIMENSION) jpeg_write_scanlines JPP((j_compress_ptr cinfo,
                                             JDIMENSION num_lines));
 EXTERN(void) jpeg_finish_compress JPP((j_compress_ptr cinfo));

+/* Precalculate JPEG dimensions for current compression parameters. */
+EXTERN(void) jpeg_calc_jpeg_dimensions JPP((j_compress_ptr cinfo));
+
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
 EXTERN(JDIMENSION) jpeg_write_raw_data JPP((j_compress_ptr cinfo,
                                            JSAMPIMAGE data,
                                            JDIMENSION num_lines));

-/* Write a special marker.  See libjpeg.doc concerning safe usage. */
+/* Write a special marker.  See libjpeg.txt concerning safe usage. */
 EXTERN(void) jpeg_write_marker
        JPP((j_compress_ptr cinfo, int marker,
             const JOCTET * dataptr, unsigned int datalen));
@@ -1010,6 +1069,7 @@ EXTERN(int) jpeg_consume_input JPP((j_decompress_ptr cinfo));
 #define JPEG_SCAN_COMPLETED	4 /* Completed last iMCU row of a scan */

 /* Precalculate output dimensions for current decompression parameters. */
+EXTERN(void) jpeg_core_output_dimensions JPP((j_decompress_ptr cinfo));
 EXTERN(void) jpeg_calc_output_dimensions JPP((j_decompress_ptr cinfo));

 /* Control saving of COM and APPn markers into marker_list. */
@@ -1104,13 +1164,10 @@ struct jpeg_color_quantizer { long dummy; };
 #include "jerror.h"		/* fetch error codes too */
 #endif

-#ifdef __BORLANDC__
-#pragma option pop /* pop -a switch */
-#pragma option pop /* pop -b */
-#endif
-
 #ifdef __cplusplus
+#ifndef DONT_USE_EXTERN_C
 }
 #endif
+#endif

 #endif /* JPEGLIB_H */
--- a/3rdparty/libjpeg/jquant1.c
+++ b/3rdparty/libjpeg/jquant1.c
@@ -2,6 +2,7 @@
 * jquant1.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -530,7 +531,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,

  for (row = 0; row < num_rows; row++) {
    /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
+    FMEMZERO((void FAR *) output_buf[row],
             (size_t) (width * SIZEOF(JSAMPLE)));
    row_index = cquantize->row_index;
    for (ci = 0; ci < nc; ci++) {
@@ -635,7 +636,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,

  for (row = 0; row < num_rows; row++) {
    /* Initialize output values to 0 so can process components separately */
-    jzero_far((void FAR *) output_buf[row],
+    FMEMZERO((void FAR *) output_buf[row],
             (size_t) (width * SIZEOF(JSAMPLE)));
    for (ci = 0; ci < nc; ci++) {
      input_ptr = input_buf[row] + ci;
@@ -781,7 +782,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
    /* Initialize the propagated errors to zero. */
    arraysize = (size_t) ((cinfo->output_width + 2) * SIZEOF(FSERROR));
    for (i = 0; i < cinfo->out_color_components; i++)
-      jzero_far((void FAR *) cquantize->fserrors[i], arraysize);
+      FMEMZERO((void FAR *) cquantize->fserrors[i], arraysize);
    break;
  default:
    ERREXIT(cinfo, JERR_NOT_COMPILED);
--- a/3rdparty/libjpeg/jquant2.c
+++ b/3rdparty/libjpeg/jquant2.c
@@ -2,6 +2,7 @@
 * jquant2.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -1203,7 +1204,7 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
        cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
          ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
      /* Initialize the propagated errors to zero. */
-      jzero_far((void FAR *) cquantize->fserrors, arraysize);
+      FMEMZERO((void FAR *) cquantize->fserrors, arraysize);
      /* Make the error-limit table if we didn't already. */
      if (cquantize->error_limiter == NULL)
        init_error_limit(cinfo);
@@ -1214,7 +1215,7 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
  /* Zero the histogram or inverse color map, if necessary */
  if (cquantize->needs_zeroed) {
    for (i = 0; i < HIST_C0_ELEMS; i++) {
-      jzero_far((void FAR *) histogram[i],
+      FMEMZERO((void FAR *) histogram[i],
               HIST_C1_ELEMS*HIST_C2_ELEMS * SIZEOF(histcell));
    }
    cquantize->needs_zeroed = FALSE;
--- a/3rdparty/libjpeg/jutils.c
+++ b/3rdparty/libjpeg/jutils.c
@@ -2,6 +2,7 @@
 * jutils.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
+ * Modified 2009-2011 by Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -63,6 +64,57 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
 63, 63, 63, 63, 63, 63, 63, 63
 };

+const int jpeg_natural_order7[7*7+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13,  6, 14, 21, 28, 35,
+ 42, 49, 50, 43, 36, 29, 22, 30,
+ 37, 44, 51, 52, 45, 38, 46, 53,
+ 54,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order6[6*6+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4,  5,
+ 12, 19, 26, 33, 40, 41, 34, 27,
+ 20, 13, 21, 28, 35, 42, 43, 36,
+ 29, 37, 44, 45,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order5[5*5+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 32, 25, 18, 11,  4, 12,
+ 19, 26, 33, 34, 27, 20, 28, 35,
+ 36,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order4[4*4+16] = {
+  0,  1,  8, 16,  9,  2,  3, 10,
+ 17, 24, 25, 18, 11, 19, 26, 27,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order3[3*3+16] = {
+  0,  1,  8, 16,  9,  2, 10, 17,
+ 18,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int jpeg_natural_order2[2*2+16] = {
+  0,  1,  8,  9,
+ 63, 63, 63, 63, 63, 63, 63, 63, /* extra entries for safety in decoder */
+ 63, 63, 63, 63, 63, 63, 63, 63
+};
+

 /*
 * Arithmetic utilities
@@ -96,13 +148,27 @@ jround_up (long a, long b)
 * is not all that great, because these routines aren't very heavily used.)
 */

-#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macros */
+#ifndef NEED_FAR_POINTERS	/* normal case, same as regular macro */
 #define FMEMCOPY(dest,src,size)	MEMCOPY(dest,src,size)
-#define FMEMZERO(target,size)	MEMZERO(target,size)
 #else				/* 80x86 case, define if we can */
 #ifdef USE_FMEM
 #define FMEMCOPY(dest,src,size)	_fmemcpy((void FAR *)(dest), (const void FAR *)(src), (size_t)(size))
-#define FMEMZERO(target,size)	_fmemset((void FAR *)(target), 0, (size_t)(size))
+#else
+/* This function is for use by the FMEMZERO macro defined in jpegint.h.
+ * Do not call this function directly, use the FMEMZERO macro instead.
+ */
+GLOBAL(void)
+jzero_far (void FAR * target, size_t bytestozero)
+/* Zero out a chunk of FAR memory. */
+/* This might be sample-array data, block-array data, or alloc_large data. */
+{
+  register char FAR * ptr = (char FAR *) target;
+  register size_t count;
+
+  for (count = bytestozero; count > 0; count--) {
+    *ptr++ = 0;
+  }
+}
 #endif
 #endif

@@ -159,21 +225,3 @@ jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
  }
 #endif
 }
-
-
-GLOBAL(void)
-jzero_far (void FAR * target, size_t bytestozero)
-/* Zero out a chunk of FAR memory. */
-/* This might be sample-array data, block-array data, or alloc_large data. */
-{
-#ifdef FMEMZERO
-  FMEMZERO(target, bytestozero);
-#else
-  register char FAR * ptr = (char FAR *) target;
-  register size_t count;
-
-  for (count = bytestozero; count > 0; count--) {
-    *ptr++ = 0;
-  }
-#endif
-}
--- a/3rdparty/libjpeg/jversion.h
+++ b/3rdparty/libjpeg/jversion.h
@@ -1,7 +1,7 @@
 /*
 * jversion.h
 *
- * Copyright (C) 1991-1998, Thomas G. Lane.
+ * Copyright (C) 1991-2013, Thomas G. Lane, Guido Vollbeding.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
@@ -9,6 +9,6 @@
 */


-#define JVERSION	"6b  27-Mar-1998"
+#define JVERSION	"9  13-Jan-2013"

-#define JCOPYRIGHT	"Copyright (C) 1998, Thomas G. Lane"
+#define JCOPYRIGHT	"Copyright (C) 2013, Thomas G. Lane, Guido Vollbeding"
--- a/3rdparty/libjpeg/transupp.c
+++ b/3rdparty/libjpeg/transupp.c
--- a/3rdparty/libjpeg/transupp.h
+++ b/3rdparty/libjpeg/transupp.h
@@ -1,205 +0,0 @@
-/*
- * transupp.h
- *
- * Copyright (C) 1997-2001, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
- * For conditions of distribution and use, see the accompanying README file.
- *
- * This file contains declarations for image transformation routines and
- * other utility code used by the jpegtran sample application.  These are
- * NOT part of the core JPEG library.  But we keep these routines separate
- * from jpegtran.c to ease the task of maintaining jpegtran-like programs
- * that have other user interfaces.
- *
- * NOTE: all the routines declared here have very specific requirements
- * about when they are to be executed during the reading and writing of the
- * source and destination files.  See the comments in transupp.c, or see
- * jpegtran.c for an example of correct usage.
- */
-
-/* If you happen not to want the image transform support, disable it here */
-#ifndef TRANSFORMS_SUPPORTED
-#define TRANSFORMS_SUPPORTED 1		/* 0 disables transform code */
-#endif
-
-/*
- * Although rotating and flipping data expressed as DCT coefficients is not
- * hard, there is an asymmetry in the JPEG format specification for images
- * whose dimensions aren't multiples of the iMCU size.  The right and bottom
- * image edges are padded out to the next iMCU boundary with junk data; but
- * no padding is possible at the top and left edges.  If we were to flip
- * the whole image including the pad data, then pad garbage would become
- * visible at the top and/or left, and real pixels would disappear into the
- * pad margins --- perhaps permanently, since encoders & decoders may not
- * bother to preserve DCT blocks that appear to be completely outside the
- * nominal image area.  So, we have to exclude any partial iMCUs from the
- * basic transformation.
- *
- * Transpose is the only transformation that can handle partial iMCUs at the
- * right and bottom edges completely cleanly.  flip_h can flip partial iMCUs
- * at the bottom, but leaves any partial iMCUs at the right edge untouched.
- * Similarly flip_v leaves any partial iMCUs at the bottom edge untouched.
- * The other transforms are defined as combinations of these basic transforms
- * and process edge blocks in a way that preserves the equivalence.
- *
- * The "trim" option causes untransformable partial iMCUs to be dropped;
- * this is not strictly lossless, but it usually gives the best-looking
- * result for odd-size images.  Note that when this option is active,
- * the expected mathematical equivalences between the transforms may not hold.
- * (For example, -rot 270 -trim trims only the bottom edge, but -rot 90 -trim
- * followed by -rot 180 -trim trims both edges.)
- *
- * We also offer a lossless-crop option, which discards data outside a given
- * image region but losslessly preserves what is inside.  Like the rotate and
- * flip transforms, lossless crop is restricted by the JPEG format: the upper
- * left corner of the selected region must fall on an iMCU boundary.  If this
- * does not hold for the given crop parameters, we silently move the upper left
- * corner up and/or left to make it so, simultaneously increasing the region
- * dimensions to keep the lower right crop corner unchanged.  (Thus, the
- * output image covers at least the requested region, but may cover more.)
- *
- * If both crop and a rotate/flip transform are requested, the crop is applied
- * last --- that is, the crop region is specified in terms of the destination
- * image.
- *
- * We also offer a "force to grayscale" option, which simply discards the
- * chrominance channels of a YCbCr image.  This is lossless in the sense that
- * the luminance channel is preserved exactly.  It's not the same kind of
- * thing as the rotate/flip transformations, but it's convenient to handle it
- * as part of this package, mainly because the transformation routines have to
- * be aware of the option to know how many components to work on.
- */
-
-
-/* Short forms of external names for systems with brain-damaged linkers. */
-
-#ifdef NEED_SHORT_EXTERNAL_NAMES
-#define jtransform_parse_crop_spec	jTrParCrop
-#define jtransform_request_workspace	jTrRequest
-#define jtransform_adjust_parameters	jTrAdjust
-#define jtransform_execute_transform	jTrExec
-#define jtransform_perfect_transform	jTrPerfect
-#define jcopy_markers_setup		jCMrkSetup
-#define jcopy_markers_execute		jCMrkExec
-#endif /* NEED_SHORT_EXTERNAL_NAMES */
-
-
-/*
- * Codes for supported types of image transformations.
- */
-
-typedef enum {
-    JXFORM_NONE,		/* no transformation */
-    JXFORM_FLIP_H,		/* horizontal flip */
-    JXFORM_FLIP_V,		/* vertical flip */
-    JXFORM_TRANSPOSE,	/* transpose across UL-to-LR axis */
-    JXFORM_TRANSVERSE,	/* transpose across UR-to-LL axis */
-    JXFORM_ROT_90,		/* 90-degree clockwise rotation */
-    JXFORM_ROT_180,		/* 180-degree rotation */
-    JXFORM_ROT_270		/* 270-degree clockwise (or 90 ccw) */
-} JXFORM_CODE;
-
-/*
- * Codes for crop parameters, which can individually be unspecified,
- * positive, or negative.  (Negative width or height makes no sense, though.)
- */
-
-typedef enum {
-    JCROP_UNSET,
-    JCROP_POS,
-    JCROP_NEG
-} JCROP_CODE;
-
-/*
- * Transform parameters struct.
- * NB: application must not change any elements of this struct after
- * calling jtransform_request_workspace.
- */
-
-typedef struct {
-  /* Options: set by caller */
-  JXFORM_CODE transform;	/* image transform operator */
-  boolean perfect;		/* if TRUE, fail if partial MCUs are requested */
-  boolean trim;			/* if TRUE, trim partial MCUs as needed */
-  boolean force_grayscale;	/* if TRUE, convert color image to grayscale */
-  boolean crop;			/* if TRUE, crop source image */
-
-  /* Crop parameters: application need not set these unless crop is TRUE.
-   * These can be filled in by jtransform_parse_crop_spec().
-   */
-  JDIMENSION crop_width;	/* Width of selected region */
-  JCROP_CODE crop_width_set;
-  JDIMENSION crop_height;	/* Height of selected region */
-  JCROP_CODE crop_height_set;
-  JDIMENSION crop_xoffset;	/* X offset of selected region */
-  JCROP_CODE crop_xoffset_set;	/* (negative measures from right edge) */
-  JDIMENSION crop_yoffset;	/* Y offset of selected region */
-  JCROP_CODE crop_yoffset_set;	/* (negative measures from bottom edge) */
-
-  /* Internal workspace: caller should not touch these */
-  int num_components;		/* # of components in workspace */
-  jvirt_barray_ptr * workspace_coef_arrays; /* workspace for transformations */
-  JDIMENSION output_width;	/* cropped destination dimensions */
-  JDIMENSION output_height;
-  JDIMENSION x_crop_offset;	/* destination crop offsets measured in iMCUs */
-  JDIMENSION y_crop_offset;
-  int max_h_samp_factor;	/* destination iMCU size */
-  int max_v_samp_factor;
-} jpeg_transform_info;
-
-
-#if TRANSFORMS_SUPPORTED
-
-/* Parse a crop specification (written in X11 geometry style) */
-EXTERN(boolean) jtransform_parse_crop_spec
-    JPP((jpeg_transform_info *info, const char *spec));
-/* Request any required workspace */
-EXTERN(void) jtransform_request_workspace
-    JPP((j_decompress_ptr srcinfo, jpeg_transform_info *info));
-/* Adjust output image parameters */
-EXTERN(jvirt_barray_ptr *) jtransform_adjust_parameters
-    JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         jvirt_barray_ptr *src_coef_arrays,
-         jpeg_transform_info *info));
-/* Execute the actual transformation, if any */
-EXTERN(void) jtransform_execute_transform
-    JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         jvirt_barray_ptr *src_coef_arrays,
-         jpeg_transform_info *info));
-/* Determine whether lossless transformation is perfectly
- * possible for a specified image and transformation.
- */
-EXTERN(boolean) jtransform_perfect_transform
-    JPP((JDIMENSION image_width, JDIMENSION image_height,
-         int MCU_width, int MCU_height,
-         JXFORM_CODE transform));
-
-/* jtransform_execute_transform used to be called
- * jtransform_execute_transformation, but some compilers complain about
- * routine names that long.  This macro is here to avoid breaking any
- * old source code that uses the original name...
- */
-#define jtransform_execute_transformation	jtransform_execute_transform
-
-#endif /* TRANSFORMS_SUPPORTED */
-
-
-/*
- * Support for copying optional markers from source to destination file.
- */
-
-typedef enum {
-    JCOPYOPT_NONE,		/* copy no optional markers */
-    JCOPYOPT_COMMENTS,	/* copy only comment (COM) markers */
-    JCOPYOPT_ALL		/* copy all optional markers */
-} JCOPY_OPTION;
-
-#define JCOPYOPT_DEFAULT  JCOPYOPT_COMMENTS	/* recommended default */
-
-/* Setup decompression object to save desired markers in memory */
-EXTERN(void) jcopy_markers_setup
-    JPP((j_decompress_ptr srcinfo, JCOPY_OPTION option));
-/* Copy markers saved in the given source object to the destination object */
-EXTERN(void) jcopy_markers_execute
-    JPP((j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
-         JCOPY_OPTION option));
--- a/3rdparty/libtiff/CMakeLists.txt
+++ b/3rdparty/libtiff/CMakeLists.txt
@@ -89,12 +89,15 @@ endif(WIN32)

 ocv_warnings_disable(CMAKE_C_FLAGS -Wno-unused-but-set-variable -Wmissing-prototypes -Wmissing-declarations -Wundef -Wunused -Wsign-compare
                                   -Wcast-align -Wshadow -Wno-maybe-uninitialized -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast)
+ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter) # clang
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-declarations -Wunused-parameter)
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4018 /wd4100 /wd4127 /wd4311 /wd4701 /wd4706) # vs2005
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244) # vs2008
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4267 /wd4305 /wd4306) # vs2008 Win64
 ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4703) # vs2012

+ocv_warnings_disable(CMAKE_C_FLAGS /wd4267 /wd4244 /wd4018)
+
 if(UNIX AND (CMAKE_COMPILER_IS_GNUCXX OR CV_ICC))
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
 endif()
--- a/3rdparty/libwebp/CMakeLists.txt
+++ b/3rdparty/libwebp/CMakeLists.txt
@@ -0,0 +1,58 @@
+# ----------------------------------------------------------------------------
+#  CMake file for libwebp. See root CMakeLists.txt
+#
+# ----------------------------------------------------------------------------
+project(${WEBP_LIBRARY})
+
+ocv_include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+ocv_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/cpu-features")
+
+file(GLOB lib_srcs dec/*.c dsp/*.c enc/*.c mux/*.c utils/*.c webp/*.c)
+file(GLOB lib_hdrs dec/*.h dsp/*.h enc/*.h mux/*.h utils/*.h webp/*.h)
+
+if(ANDROID AND ARMEABI_V7A AND NOT NEON)
+  foreach(file ${lib_srcs})
+    if("${file}" MATCHES "_neon.c")
+      set_source_files_properties("${file}" COMPILE_FLAGS "-mfpu=neon")
+    endif()
+  endforeach()
+endif()
+
+file(GLOB cpuf_s cpu-features/*.c)
+file(GLOB cpuf_h cpu-features/*.h)
+
+if(ANDROID)
+  set(lib_srcs ${lib_srcs} ${cpuf_s})
+  set(lib_hdrs ${lib_hdrs} ${cpuf_h})
+endif()
+
+# ----------------------------------------------------------------------------------
+#         Define the library target:
+# ----------------------------------------------------------------------------------
+
+add_definitions(-DWEBP_USE_THREAD)
+
+add_library(${WEBP_LIBRARY} STATIC ${lib_srcs} ${lib_hdrs})
+
+if(UNIX)
+  if(CMAKE_COMPILER_IS_GNUCXX OR CV_ICC)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+  endif()
+endif()
+
+ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-variable -Wshadow)
+ocv_warnings_disable(CMAKE_C_FLAGS /wd4244 /wd4267) # vs2005
+
+set_target_properties(${WEBP_LIBRARY}
+  PROPERTIES OUTPUT_NAME ${WEBP_LIBRARY}
+  DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
+  ARCHIVE_OUTPUT_DIRECTORY ${3P_LIBRARY_OUTPUT_PATH}
+  )
+
+if(ENABLE_SOLUTION_FOLDERS)
+  set_target_properties(${WEBP_LIBRARY} PROPERTIES FOLDER "3rdparty")
+endif()
+
+if(NOT BUILD_SHARED_LIBS)
+  install(TARGETS ${WEBP_LIBRARY} ARCHIVE DESTINATION ${OPENCV_3P_LIB_INSTALL_PATH} COMPONENT main)
+endif()
--- a/3rdparty/libwebp/cpu-features/cpu-features.c
+++ b/3rdparty/libwebp/cpu-features/cpu-features.c
@@ -0,0 +1,971 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ChangeLog for this library:
+ *
+ * NDK r8d: Add android_setCpu().
+ *
+ * NDK r8c: Add new ARM CPU features: VFPv2, VFP_D32, VFP_FP16,
+ *          VFP_FMA, NEON_FMA, IDIV_ARM, IDIV_THUMB2 and iWMMXt.
+ *
+ *          Rewrite the code to parse /proc/self/auxv instead of
+ *          the "Features" field in /proc/cpuinfo.
+ *
+ *          Dynamically allocate the buffer that hold the content
+ *          of /proc/cpuinfo to deal with newer hardware.
+ *
+ * NDK r7c: Fix CPU count computation. The old method only reported the
+ *           number of _active_ CPUs when the library was initialized,
+ *           which could be less than the real total.
+ *
+ * NDK r5: Handle buggy kernels which report a CPU Architecture number of 7
+ *         for an ARMv6 CPU (see below).
+ *
+ *         Handle kernels that only report 'neon', and not 'vfpv3'
+ *         (VFPv3 is mandated by the ARM architecture is Neon is implemented)
+ *
+ *         Handle kernels that only report 'vfpv3d16', and not 'vfpv3'
+ *
+ *         Fix x86 compilation. Report ANDROID_CPU_FAMILY_X86 in
+ *         android_getCpuFamily().
+ *
+ * NDK r4: Initial release
+ */
+#include <sys/system_properties.h>
+#ifdef __arm__
+#include <machine/cpu-features.h>
+#endif
+#include <pthread.h>
+#include "cpu-features.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+
+static  pthread_once_t     g_once;
+static  int                g_inited;
+static  AndroidCpuFamily   g_cpuFamily;
+static  uint64_t           g_cpuFeatures;
+static  int                g_cpuCount;
+
+static const int  android_cpufeatures_debug = 0;
+
+#ifdef __arm__
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_ARM
+#elif defined __i386__
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_X86
+#else
+#  define DEFAULT_CPU_FAMILY  ANDROID_CPU_FAMILY_UNKNOWN
+#endif
+
+#define  D(...) \
+    do { \
+        if (android_cpufeatures_debug) { \
+            printf(__VA_ARGS__); fflush(stdout); \
+        } \
+    } while (0)
+
+#ifdef __i386__
+static __inline__ void x86_cpuid(int func, int values[4])
+{
+    int a, b, c, d;
+    /* We need to preserve ebx since we're compiling PIC code */
+    /* this means we can't use "=b" for the second output register */
+    __asm__ __volatile__ ( \
+      "push %%ebx\n"
+      "cpuid\n" \
+      "mov %%ebx, %1\n"
+      "pop %%ebx\n"
+      : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+      : "a" (func) \
+    );
+    values[0] = a;
+    values[1] = b;
+    values[2] = c;
+    values[3] = d;
+}
+#endif
+
+/* Get the size of a file by reading it until the end. This is needed
+ * because files under /proc do not always return a valid size when
+ * using fseek(0, SEEK_END) + ftell(). Nor can they be mmap()-ed.
+ */
+static int
+get_file_size(const char* pathname)
+{
+    int fd, ret, result = 0;
+    char buffer[256];
+
+    fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        D("Can't open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+
+    for (;;) {
+        int ret = read(fd, buffer, sizeof buffer);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading %s: %s\n", pathname, strerror(errno));
+            break;
+        }
+        if (ret == 0)
+            break;
+
+        result += ret;
+    }
+    close(fd);
+    return result;
+}
+
+/* Read the content of /proc/cpuinfo into a user-provided buffer.
+ * Return the length of the data, or -1 on error. Does *not*
+ * zero-terminate the content. Will not read more
+ * than 'buffsize' bytes.
+ */
+static int
+read_file(const char*  pathname, char*  buffer, size_t  buffsize)
+{
+    int  fd, count;
+
+    fd = open(pathname, O_RDONLY);
+    if (fd < 0) {
+        D("Could not open %s: %s\n", pathname, strerror(errno));
+        return -1;
+    }
+    count = 0;
+    while (count < (int)buffsize) {
+        int ret = read(fd, buffer + count, buffsize - count);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading from %s: %s\n", pathname, strerror(errno));
+            if (count == 0)
+                count = -1;
+            break;
+        }
+        if (ret == 0)
+            break;
+        count += ret;
+    }
+    close(fd);
+    return count;
+}
+
+/* Extract the content of a the first occurence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated
+ * string that must be freed by the caller.
+ *
+ * Return NULL if not found
+ */
+static char*
+extract_cpuinfo_field(const char* buffer, int buflen, const char* field)
+{
+    int  fieldlen = strlen(field);
+    const char* bufend = buffer + buflen;
+    char* result = NULL;
+    int len, ignore;
+    const char *p, *q;
+
+    /* Look for first field occurence, and ensures it starts the line. */
+    p = buffer;
+    bufend = buffer + buflen;
+    for (;;) {
+        p = memmem(p, bufend-p, field, fieldlen);
+        if (p == NULL)
+            goto EXIT;
+
+        if (p == buffer || p[-1] == '\n')
+            break;
+
+        p += fieldlen;
+    }
+
+    /* Skip to the first column followed by a space */
+    p += fieldlen;
+    p  = memchr(p, ':', bufend-p);
+    if (p == NULL || p[1] != ' ')
+        goto EXIT;
+
+    /* Find the end of the line */
+    p += 2;
+    q = memchr(p, '\n', bufend-p);
+    if (q == NULL)
+        q = bufend;
+
+    /* Copy the line into a heap-allocated buffer */
+    len = q-p;
+    result = malloc(len+1);
+    if (result == NULL)
+        goto EXIT;
+
+    memcpy(result, p, len);
+    result[len] = '\0';
+
+EXIT:
+    return result;
+}
+
+/* Like strlen(), but for constant string literals */
+#define STRLEN_CONST(x)  ((sizeof(x)-1)
+
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int
+has_list_item(const char* list, const char* item)
+{
+    const char*  p = list;
+    int itemlen = strlen(item);
+
+    if (list == NULL)
+        return 0;
+
+    while (*p) {
+        const char*  q;
+
+        /* skip spaces */
+        while (*p == ' ' || *p == '\t')
+            p++;
+
+        /* find end of current list item */
+        q = p;
+        while (*q && *q != ' ' && *q != '\t')
+            q++;
+
+        if (itemlen == q-p && !memcmp(p, item, itemlen))
+            return 1;
+
+        /* skip to next item */
+        p = q;
+    }
+    return 0;
+}
+
+/* Parse an decimal integer starting from 'input', but not going further
+ * than 'limit'. Return the value into '*result'.
+ *
+ * NOTE: Does not skip over leading spaces, or deal with sign characters.
+ * NOTE: Ignores overflows.
+ *
+ * The function returns NULL in case of error (bad format), or the new
+ * position after the decimal number in case of success (which will always
+ * be <= 'limit').
+ */
+static const char*
+parse_decimal(const char* input, const char* limit, int* result)
+{
+    const char* p = input;
+    int val = 0;
+    while (p < limit) {
+        int d = (*p - '0');
+        if ((unsigned)d >= 10U)
+            break;
+        val = val*10 + d;
+        p++;
+    }
+    if (p == input)
+        return NULL;
+
+    *result = val;
+    return p;
+}
+
+/* This small data type is used to represent a CPU list / mask, as read
+ * from sysfs on Linux. See http://www.kernel.org/doc/Documentation/cputopology.txt
+ *
+ * For now, we don't expect more than 32 cores on mobile devices, so keep
+ * everything simple.
+ */
+typedef struct {
+    uint32_t mask;
+} CpuList;
+
+static __inline__ void
+cpulist_init(CpuList* list) {
+    list->mask = 0;
+}
+
+static __inline__ void
+cpulist_and(CpuList* list1, CpuList* list2) {
+    list1->mask &= list2->mask;
+}
+
+static __inline__ void
+cpulist_set(CpuList* list, int index) {
+    if ((unsigned)index < 32) {
+        list->mask |= (uint32_t)(1U << index);
+    }
+}
+
+static __inline__ int
+cpulist_count(CpuList* list) {
+    return __builtin_popcount(list->mask);
+}
+
+/* Parse a textual list of cpus and store the result inside a CpuList object.
+ * Input format is the following:
+ * - comma-separated list of items (no spaces)
+ * - each item is either a single decimal number (cpu index), or a range made
+ *   of two numbers separated by a single dash (-). Ranges are inclusive.
+ *
+ * Examples:   0
+ *             2,4-127,128-143
+ *             0-1
+ */
+static void
+cpulist_parse(CpuList* list, const char* line, int line_len)
+{
+    const char* p = line;
+    const char* end = p + line_len;
+    const char* q;
+
+    /* NOTE: the input line coming from sysfs typically contains a
+     * trailing newline, so take care of it in the code below
+     */
+    while (p < end && *p != '\n')
+    {
+        int val, start_value, end_value;
+
+        /* Find the end of current item, and put it into 'q' */
+        q = memchr(p, ',', end-p);
+        if (q == NULL) {
+            q = end;
+        }
+
+        /* Get first value */
+        p = parse_decimal(p, q, &start_value);
+        if (p == NULL)
+            goto BAD_FORMAT;
+
+        end_value = start_value;
+
+        /* If we're not at the end of the item, expect a dash and
+         * and integer; extract end value.
+         */
+        if (p < q && *p == '-') {
+            p = parse_decimal(p+1, q, &end_value);
+            if (p == NULL)
+                goto BAD_FORMAT;
+        }
+
+        /* Set bits CPU list bits */
+        for (val = start_value; val <= end_value; val++) {
+            cpulist_set(list, val);
+        }
+
+        /* Jump to next item */
+        p = q;
+        if (p < end)
+            p++;
+    }
+
+BAD_FORMAT:
+    ;
+}
+
+/* Read a CPU list from one sysfs file */
+static void
+cpulist_read_from(CpuList* list, const char* filename)
+{
+    char   file[64];
+    int    filelen;
+
+    cpulist_init(list);
+
+    filelen = read_file(filename, file, sizeof file);
+    if (filelen < 0) {
+        D("Could not read %s: %s\n", filename, strerror(errno));
+        return;
+    }
+
+    cpulist_parse(list, file, filelen);
+}
+
+// See <asm/hwcap.h> kernel header.
+#define HWCAP_VFP       (1 << 6)
+#define HWCAP_IWMMXT    (1 << 9)
+#define HWCAP_NEON      (1 << 12)
+#define HWCAP_VFPv3     (1 << 13)
+#define HWCAP_VFPv3D16  (1 << 14)
+#define HWCAP_VFPv4     (1 << 16)
+#define HWCAP_IDIVA     (1 << 17)
+#define HWCAP_IDIVT     (1 << 18)
+
+#define AT_HWCAP 16
+
+#if defined(__arm__)
+/* Compute the ELF HWCAP flags.
+ */
+static uint32_t
+get_elf_hwcap(const char* cpuinfo, int cpuinfo_len)
+{
+  /* IMPORTANT:
+   *   Accessing /proc/self/auxv doesn't work anymore on all
+   *   platform versions. More specifically, when running inside
+   *   a regular application process, most of /proc/self/ will be
+   *   non-readable, including /proc/self/auxv. This doesn't
+   *   happen however if the application is debuggable, or when
+   *   running under the "shell" UID, which is why this was not
+   *   detected appropriately.
+   */
+#if 0
+    uint32_t result = 0;
+    const char filepath[] = "/proc/self/auxv";
+    int fd = open(filepath, O_RDONLY);
+    if (fd < 0) {
+        D("Could not open %s: %s\n", filepath, strerror(errno));
+        return 0;
+    }
+
+    struct { uint32_t tag; uint32_t value; } entry;
+
+    for (;;) {
+        int ret = read(fd, (char*)&entry, sizeof entry);
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            D("Error while reading %s: %s\n", filepath, strerror(errno));
+            break;
+        }
+        // Detect end of list.
+        if (ret == 0 || (entry.tag == 0 && entry.value == 0))
+          break;
+        if (entry.tag == AT_HWCAP) {
+          result = entry.value;
+          break;
+        }
+    }
+    close(fd);
+    return result;
+#else
+    // Recreate ELF hwcaps by parsing /proc/cpuinfo Features tag.
+    uint32_t hwcaps = 0;
+
+    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "Features");
+
+    if (cpuFeatures != NULL) {
+        D("Found cpuFeatures = '%s'\n", cpuFeatures);
+
+        if (has_list_item(cpuFeatures, "vfp"))
+            hwcaps |= HWCAP_VFP;
+        if (has_list_item(cpuFeatures, "vfpv3"))
+            hwcaps |= HWCAP_VFPv3;
+        if (has_list_item(cpuFeatures, "vfpv3d16"))
+            hwcaps |= HWCAP_VFPv3D16;
+        if (has_list_item(cpuFeatures, "vfpv4"))
+            hwcaps |= HWCAP_VFPv4;
+        if (has_list_item(cpuFeatures, "neon"))
+            hwcaps |= HWCAP_NEON;
+        if (has_list_item(cpuFeatures, "idiva"))
+            hwcaps |= HWCAP_IDIVA;
+        if (has_list_item(cpuFeatures, "idivt"))
+            hwcaps |= HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "idiv"))
+            hwcaps |= HWCAP_IDIVA | HWCAP_IDIVT;
+        if (has_list_item(cpuFeatures, "iwmmxt"))
+            hwcaps |= HWCAP_IWMMXT;
+
+        free(cpuFeatures);
+    }
+    return hwcaps;
+#endif
+}
+#endif  /* __arm__ */
+
+/* Return the number of cpus present on a given device.
+ *
+ * To handle all weird kernel configurations, we need to compute the
+ * intersection of the 'present' and 'possible' CPU lists and count
+ * the result.
+ */
+static int
+get_cpu_count(void)
+{
+    CpuList cpus_present[1];
+    CpuList cpus_possible[1];
+
+    cpulist_read_from(cpus_present, "/sys/devices/system/cpu/present");
+    cpulist_read_from(cpus_possible, "/sys/devices/system/cpu/possible");
+
+    /* Compute the intersection of both sets to get the actual number of
+     * CPU cores that can be used on this device by the kernel.
+     */
+    cpulist_and(cpus_present, cpus_possible);
+
+    return cpulist_count(cpus_present);
+}
+
+static void
+android_cpuInitFamily(void)
+{
+#if defined(__ARM_ARCH__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_ARM;
+#elif defined(__i386__)
+    g_cpuFamily = ANDROID_CPU_FAMILY_X86;
+#elif defined(_MIPS_ARCH)
+    g_cpuFamily = ANDROID_CPU_FAMILY_MIPS;
+#else
+    g_cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+static void
+android_cpuInit(void)
+{
+    char* cpuinfo = NULL;
+    int   cpuinfo_len;
+
+    android_cpuInitFamily();
+
+    g_cpuFeatures = 0;
+    g_cpuCount    = 1;
+    g_inited      = 1;
+
+    cpuinfo_len = get_file_size("/proc/cpuinfo");
+    if (cpuinfo_len < 0) {
+      D("cpuinfo_len cannot be computed!");
+      return;
+    }
+    cpuinfo = malloc(cpuinfo_len);
+    if (cpuinfo == NULL) {
+      D("cpuinfo buffer could not be allocated");
+      return;
+    }
+    cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, cpuinfo_len);
+    D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+      cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+    if (cpuinfo_len < 0)  /* should not happen */ {
+        free(cpuinfo);
+        return;
+    }
+
+    /* Count the CPU cores, the value may be 0 for single-core CPUs */
+    g_cpuCount = get_cpu_count();
+    if (g_cpuCount == 0) {
+        g_cpuCount = 1;
+    }
+
+    D("found cpuCount = %d\n", g_cpuCount);
+
+#ifdef __ARM_ARCH__
+    {
+        char*  features = NULL;
+        char*  architecture = NULL;
+
+        /* Extract architecture from the "CPU Architecture" field.
+         * The list is well-known, unlike the the output of
+         * the 'Processor' field which can vary greatly.
+         *
+         * See the definition of the 'proc_arch' array in
+         * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+         * same file.
+         */
+        char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len, "CPU architecture");
+
+        if (cpuArch != NULL) {
+            char*  end;
+            long   archNumber;
+            int    hasARMv7 = 0;
+
+            D("found cpuArch = '%s'\n", cpuArch);
+
+            /* read the initial decimal number, ignore the rest */
+            archNumber = strtol(cpuArch, &end, 10);
+
+            /* Here we assume that ARMv8 will be upwards compatible with v7
+             * in the future. Unfortunately, there is no 'Features' field to
+             * indicate that Thumb-2 is supported.
+             */
+            if (end > cpuArch && archNumber >= 7) {
+                hasARMv7 = 1;
+            }
+
+            /* Unfortunately, it seems that certain ARMv6-based CPUs
+             * report an incorrect architecture number of 7!
+             *
+             * See http://code.google.com/p/android/issues/detail?id=10812
+             *
+             * We try to correct this by looking at the 'elf_format'
+             * field reported by the 'Processor' field, which is of the
+             * form of "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+             * an ARMv6-one.
+             */
+            if (hasARMv7) {
+                char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                                      "Processor");
+                if (cpuProc != NULL) {
+                    D("found cpuProc = '%s'\n", cpuProc);
+                    if (has_list_item(cpuProc, "(v6l)")) {
+                        D("CPU processor and architecture mismatch!!\n");
+                        hasARMv7 = 0;
+                    }
+                    free(cpuProc);
+                }
+            }
+
+            if (hasARMv7) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+            }
+
+            /* The LDREX / STREX instructions are available from ARMv6 */
+            if (archNumber >= 6) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+            }
+
+            free(cpuArch);
+        }
+
+        /* Extract the list of CPU features from ELF hwcaps */
+        uint32_t hwcaps = get_elf_hwcap(cpuinfo, cpuinfo_len);
+
+        if (hwcaps != 0) {
+            int has_vfp = (hwcaps & HWCAP_VFP);
+            int has_vfpv3 = (hwcaps & HWCAP_VFPv3);
+            int has_vfpv3d16 = (hwcaps & HWCAP_VFPv3D16);
+            int has_vfpv4 = (hwcaps & HWCAP_VFPv4);
+            int has_neon = (hwcaps & HWCAP_NEON);
+            int has_idiva = (hwcaps & HWCAP_IDIVA);
+            int has_idivt = (hwcaps & HWCAP_IDIVT);
+            int has_iwmmxt = (hwcaps & HWCAP_IWMMXT);
+
+            // The kernel does a poor job at ensuring consistency when
+            // describing CPU features. So lots of guessing is needed.
+
+            // 'vfpv4' implies VFPv3|VFP_FMA|FP16
+            if (has_vfpv4)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3    |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FP16 |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+
+            // 'vfpv3' or 'vfpv3d16' imply VFPv3. Note that unlike GCC,
+            // a value of 'vfpv3' doesn't necessarily mean that the D32
+            // feature is present, so be conservative. All CPUs in the
+            // field that support D32 also support NEON, so this should
+            // not be a problem in practice.
+            if (has_vfpv3 || has_vfpv3d16)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+
+            // 'vfp' is super ambiguous. Depending on the kernel, it can
+            // either mean VFPv2 or VFPv3. Make it depend on ARMv7.
+            if (has_vfp) {
+              if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7)
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+              else
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
+            }
+
+            // Neon implies VFPv3|D32, and if vfpv4 is detected, NEON_FMA
+            if (has_neon) {
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3 |
+                                 ANDROID_CPU_ARM_FEATURE_NEON |
+                                 ANDROID_CPU_ARM_FEATURE_VFP_D32;
+              if (has_vfpv4)
+                  g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
+            }
+
+            // VFPv3 implies VFPv2 and ARMv7
+            if (g_cpuFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2 |
+                                 ANDROID_CPU_ARM_FEATURE_ARMv7;
+
+            // Note that some buggy kernels do not report these even when
+            // the CPU actually support the division instructions. However,
+            // assume that if 'vfpv4' is detected, then the CPU supports
+            // sdiv/udiv properly.
+            if (has_idiva || has_vfpv4)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
+            if (has_idivt || has_vfpv4)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
+
+            if (has_iwmmxt)
+                g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
+        }
+    }
+#endif /* __ARM_ARCH__ */
+
+#ifdef __i386__
+    int regs[4];
+
+/* According to http://en.wikipedia.org/wiki/CPUID */
+#define VENDOR_INTEL_b  0x756e6547
+#define VENDOR_INTEL_c  0x6c65746e
+#define VENDOR_INTEL_d  0x49656e69
+
+    x86_cpuid(0, regs);
+    int vendorIsIntel = (regs[1] == VENDOR_INTEL_b &&
+                         regs[2] == VENDOR_INTEL_c &&
+                         regs[3] == VENDOR_INTEL_d);
+
+    x86_cpuid(1, regs);
+    if ((regs[2] & (1 << 9)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
+    }
+    if ((regs[2] & (1 << 23)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
+    }
+    if (vendorIsIntel && (regs[2] & (1 << 22)) != 0) {
+        g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
+    }
+#endif
+
+    free(cpuinfo);
+}
+
+
+AndroidCpuFamily
+android_getCpuFamily(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuFamily;
+}
+
+
+uint64_t
+android_getCpuFeatures(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuFeatures;
+}
+
+
+int
+android_getCpuCount(void)
+{
+    pthread_once(&g_once, android_cpuInit);
+    return g_cpuCount;
+}
+
+static void
+android_cpuInitDummy(void)
+{
+    g_inited = 1;
+}
+
+int
+android_setCpu(int cpu_count, uint64_t cpu_features)
+{
+    /* Fail if the library was already initialized. */
+    if (g_inited)
+        return 0;
+
+    android_cpuInitFamily();
+    g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
+    g_cpuFeatures = cpu_features;
+    pthread_once(&g_once, android_cpuInitDummy);
+
+    return 1;
+}
+
+/*
+ * Technical note: Making sense of ARM's FPU architecture versions.
+ *
+ * FPA was ARM's first attempt at an FPU architecture. There is no Android
+ * device that actually uses it since this technology was already obsolete
+ * when the project started. If you see references to FPA instructions
+ * somewhere, you can be sure that this doesn't apply to Android at all.
+ *
+ * FPA was followed by "VFP", soon renamed "VFPv1" due to the emergence of
+ * new versions / additions to it. ARM considers this obsolete right now,
+ * and no known Android device implements it either.
+ *
+ * VFPv2 added a few instructions to VFPv1, and is an *optional* extension
+ * supported by some ARMv5TE, ARMv6 and ARMv6T2 CPUs. Note that a device
+ * supporting the 'armeabi' ABI doesn't necessarily support these.
+ *
+ * VFPv3-D16 adds a few instructions on top of VFPv2 and is typically used
+ * on ARMv7-A CPUs which implement a FPU. Note that it is also mandated
+ * by the Android 'armeabi-v7a' ABI. The -D16 suffix in its name means
+ * that it provides 16 double-precision FPU registers (d0-d15) and 32
+ * single-precision ones (s0-s31) which happen to be mapped to the same
+ * register banks.
+ *
+ * VFPv3-D32 is the name of an extension to VFPv3-D16 that provides 16
+ * additional double precision registers (d16-d31). Note that there are
+ * still only 32 single precision registers.
+ *
+ * VFPv3xD is a *subset* of VFPv3-D16 that only provides single-precision
+ * registers. It is only used on ARMv7-M (i.e. on micro-controllers) which
+ * are not supported by Android. Note that it is not compatible with VFPv2.
+ *
+ * NOTE: The term 'VFPv3' usually designate either VFPv3-D16 or VFPv3-D32
+ *       depending on context. For example GCC uses it for VFPv3-D32, but
+ *       the Linux kernel code uses it for VFPv3-D16 (especially in
+ *       /proc/cpuinfo). Always try to use the full designation when
+ *       possible.
+ *
+ * NEON, a.k.a. "ARM Advanced SIMD" is an extension that provides
+ * instructions to perform parallel computations on vectors of 8, 16,
+ * 32, 64 and 128 bit quantities. NEON requires VFPv32-D32 since all
+ * NEON registers are also mapped to the same register banks.
+ *
+ * VFPv4-D16, adds a few instructions on top of VFPv3-D16 in order to
+ * perform fused multiply-accumulate on VFP registers, as well as
+ * half-precision (16-bit) conversion operations.
+ *
+ * VFPv4-D32 is VFPv4-D16 with 32, instead of 16, FPU double precision
+ * registers.
+ *
+ * VPFv4-NEON is VFPv4-D32 with NEON instructions. It also adds fused
+ * multiply-accumulate instructions that work on the NEON registers.
+ *
+ * NOTE: Similarly, "VFPv4" might either reference VFPv4-D16 or VFPv4-D32
+ *       depending on context.
+ *
+ * The following information was determined by scanning the binutils-2.22
+ * sources:
+ *
+ * Basic VFP instruction subsets:
+ *
+ * #define FPU_VFP_EXT_V1xD 0x08000000     // Base VFP instruction set.
+ * #define FPU_VFP_EXT_V1   0x04000000     // Double-precision insns.
+ * #define FPU_VFP_EXT_V2   0x02000000     // ARM10E VFPr1.
+ * #define FPU_VFP_EXT_V3xD 0x01000000     // VFPv3 single-precision.
+ * #define FPU_VFP_EXT_V3   0x00800000     // VFPv3 double-precision.
+ * #define FPU_NEON_EXT_V1  0x00400000     // Neon (SIMD) insns.
+ * #define FPU_VFP_EXT_D32  0x00200000     // Registers D16-D31.
+ * #define FPU_VFP_EXT_FP16 0x00100000     // Half-precision extensions.
+ * #define FPU_NEON_EXT_FMA 0x00080000     // Neon fused multiply-add
+ * #define FPU_VFP_EXT_FMA  0x00040000     // VFP fused multiply-add
+ *
+ * FPU types (excluding NEON)
+ *
+ * FPU_VFP_V1xD (EXT_V1xD)
+ *    |
+ *    +--------------------------+
+ *    |                          |
+ * FPU_VFP_V1 (+EXT_V1)       FPU_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ *    |                          |
+ *    |                          |
+ * FPU_VFP_V2 (+EXT_V2)       FPU_VFP_V4_SP_D16 (+EXT_FP16+EXT_FMA)
+ *    |
+ * FPU_VFP_V3D16 (+EXT_Vx3D+EXT_V3)
+ *    |
+ *    +--------------------------+
+ *    |                          |
+ * FPU_VFP_V3 (+EXT_D32)     FPU_VFP_V4D16 (+EXT_FP16+EXT_FMA)
+ *    |                          |
+ *    |                      FPU_VFP_V4 (+EXT_D32)
+ *    |
+ * FPU_VFP_HARD (+EXT_FMA+NEON_EXT_FMA)
+ *
+ * VFP architectures:
+ *
+ * ARCH_VFP_V1xD  (EXT_V1xD)
+ *   |
+ *   +------------------+
+ *   |                  |
+ *   |             ARCH_VFP_V3xD (+EXT_V2+EXT_V3xD)
+ *   |                  |
+ *   |             ARCH_VFP_V3xD_FP16 (+EXT_FP16)
+ *   |                  |
+ *   |             ARCH_VFP_V4_SP_D16 (+EXT_FMA)
+ *   |
+ * ARCH_VFP_V1 (+EXT_V1)
+ *   |
+ * ARCH_VFP_V2 (+EXT_V2)
+ *   |
+ * ARCH_VFP_V3D16 (+EXT_V3xD+EXT_V3)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ *   |                   |
+ *   |         ARCH_VFP_V4 (+EXT_D32)
+ *   |                   |
+ *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ *   |
+ * ARCH_VFP_V3 (+EXT_D32)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
+ *   |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ *   |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ * -fpu=<name> values and their correspondance with FPU architectures above:
+ *
+ *   {"vfp",               FPU_ARCH_VFP_V2},
+ *   {"vfp9",              FPU_ARCH_VFP_V2},
+ *   {"vfp3",              FPU_ARCH_VFP_V3}, // For backwards compatbility.
+ *   {"vfp10",             FPU_ARCH_VFP_V2},
+ *   {"vfp10-r0",          FPU_ARCH_VFP_V1},
+ *   {"vfpxd",             FPU_ARCH_VFP_V1xD},
+ *   {"vfpv2",             FPU_ARCH_VFP_V2},
+ *   {"vfpv3",             FPU_ARCH_VFP_V3},
+ *   {"vfpv3-fp16",        FPU_ARCH_VFP_V3_FP16},
+ *   {"vfpv3-d16",         FPU_ARCH_VFP_V3D16},
+ *   {"vfpv3-d16-fp16",    FPU_ARCH_VFP_V3D16_FP16},
+ *   {"vfpv3xd",           FPU_ARCH_VFP_V3xD},
+ *   {"vfpv3xd-fp16",      FPU_ARCH_VFP_V3xD_FP16},
+ *   {"neon",              FPU_ARCH_VFP_V3_PLUS_NEON_V1},
+ *   {"neon-fp16",         FPU_ARCH_NEON_FP16},
+ *   {"vfpv4",             FPU_ARCH_VFP_V4},
+ *   {"vfpv4-d16",         FPU_ARCH_VFP_V4D16},
+ *   {"fpv4-sp-d16",       FPU_ARCH_VFP_V4_SP_D16},
+ *   {"neon-vfpv4",        FPU_ARCH_NEON_VFP_V4},
+ *
+ *
+ * Simplified diagram that only includes FPUs supported by Android:
+ * Only ARCH_VFP_V3D16 is actually mandated by the armeabi-v7a ABI,
+ * all others are optional and must be probed at runtime.
+ *
+ * ARCH_VFP_V3D16 (EXT_V1xD+EXT_V1+EXT_V2+EXT_V3xD+EXT_V3)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3D16_FP16  (+EXT_FP16)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V4_D16 (+EXT_FP16+EXT_FMA)
+ *   |                   |
+ *   |         ARCH_VFP_V4 (+EXT_D32)
+ *   |                   |
+ *   |         ARCH_NEON_VFP_V4 (+EXT_NEON+EXT_NEON_FMA)
+ *   |
+ * ARCH_VFP_V3 (+EXT_D32)
+ *   |
+ *   +-------------------+
+ *   |                   |
+ *   |         ARCH_VFP_V3_FP16 (+EXT_FP16)
+ *   |
+ * ARCH_VFP_V3_PLUS_NEON_V1 (+EXT_NEON)
+ *   |
+ * ARCH_NEON_FP16 (+EXT_FP16)
+ *
+ */
--- a/3rdparty/libwebp/cpu-features/cpu-features.h
+++ b/3rdparty/libwebp/cpu-features/cpu-features.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef CPU_FEATURES_H
+#define CPU_FEATURES_H
+
+#include <sys/cdefs.h>
+#include <stdint.h>
+
+__BEGIN_DECLS
+
+typedef enum {
+    ANDROID_CPU_FAMILY_UNKNOWN = 0,
+    ANDROID_CPU_FAMILY_ARM,
+    ANDROID_CPU_FAMILY_X86,
+    ANDROID_CPU_FAMILY_MIPS,
+
+    ANDROID_CPU_FAMILY_MAX  /* do not remove */
+
+} AndroidCpuFamily;
+
+/* Return family of the device's CPU */
+extern AndroidCpuFamily   android_getCpuFamily(void);
+
+/* The list of feature flags for ARM CPUs that can be recognized by the
+ * library. Value details are:
+ *
+ *   VFPv2:
+ *     CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
+ *     support these instructions. VFPv2 is a subset of VFPv3 so this will
+ *     be set whenever VFPv3 is set too.
+ *
+ *   ARMv7:
+ *     CPU supports the ARMv7-A basic instruction set.
+ *     This feature is mandated by the 'armeabi-v7a' ABI.
+ *
+ *   VFPv3:
+ *     CPU supports the VFPv3-D16 instruction set, providing hardware FPU
+ *     support for single and double precision floating point registers.
+ *     Note that only 16 FPU registers are available by default, unless
+ *     the D32 bit is set too. This feature is also mandated by the
+ *     'armeabi-v7a' ABI.
+ *
+ *   VFP_D32:
+ *     CPU VFP optional extension that provides 32 FPU registers,
+ *     instead of 16. Note that ARM mandates this feature is the 'NEON'
+ *     feature is implemented by the CPU.
+ *
+ *   NEON:
+ *     CPU FPU supports "ARM Advanced SIMD" instructions, also known as
+ *     NEON. Note that this mandates the VFP_D32 feature as well, per the
+ *     ARM Architecture specification.
+ *
+ *   VFP_FP16:
+ *     Half-width floating precision VFP extension. If set, the CPU
+ *     supports instructions to perform floating-point operations on
+ *     16-bit registers. This is part of the VFPv4 specification, but
+ *     not mandated by any Android ABI.
+ *
+ *   VFP_FMA:
+ *     Fused multiply-accumulate VFP instructions extension. Also part of
+ *     the VFPv4 specification, but not mandated by any Android ABI.
+ *
+ *   NEON_FMA:
+ *     Fused multiply-accumulate NEON instructions extension. Optional
+ *     extension from the VFPv4 specification, but not mandated by any
+ *     Android ABI.
+ *
+ *   IDIV_ARM:
+ *     Integer division available in ARM mode. Only available
+ *     on recent CPUs (e.g. Cortex-A15).
+ *
+ *   IDIV_THUMB2:
+ *     Integer division available in Thumb-2 mode. Only available
+ *     on recent CPUs (e.g. Cortex-A15).
+ *
+ *   iWMMXt:
+ *     Optional extension that adds MMX registers and operations to an
+ *     ARM CPU. This is only available on a few XScale-based CPU designs
+ *     sold by Marvell. Pretty rare in practice.
+ *
+ * If you want to tell the compiler to generate code that targets one of
+ * the feature set above, you should probably use one of the following
+ * flags (for more details, see technical note at the end of this file):
+ *
+ *   -mfpu=vfp
+ *   -mfpu=vfpv2
+ *     These are equivalent and tell GCC to use VFPv2 instructions for
+ *     floating-point operations. Use this if you want your code to
+ *     run on *some* ARMv6 devices, and any ARMv7-A device supported
+ *     by Android.
+ *
+ *     Generated code requires VFPv2 feature.
+ *
+ *   -mfpu=vfpv3-d16
+ *     Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
+ *     This should be generic code that runs on any CPU that supports the
+ *     'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
+ *
+ *     Generated code requires VFPv3 feature.
+ *
+ *   -mfpu=vfpv3
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers.
+ *     Generated code requires VFPv3|VFP_D32 features.
+ *
+ *   -mfpu=neon
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers, and
+ *     also support NEON intrinsics (see <arm_neon.h>).
+ *     Generated code requires VFPv3|VFP_D32|NEON features.
+ *
+ *   -mfpu=vfpv4-d16
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
+ *
+ *   -mfpu=vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
+ *
+ *   -mfpu=neon-vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
+ *     features.
+ *
+ *   -mcpu=cortex-a7
+ *   -mcpu=cortex-a15
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
+ *                             NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
+ *     This flag implies -mfpu=neon-vfpv4.
+ *
+ *   -mcpu=iwmmxt
+ *     Allows the use of iWMMXt instrinsics with GCC.
+ */
+enum {
+    ANDROID_CPU_ARM_FEATURE_ARMv7       = (1 << 0),
+    ANDROID_CPU_ARM_FEATURE_VFPv3       = (1 << 1),
+    ANDROID_CPU_ARM_FEATURE_NEON        = (1 << 2),
+    ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
+    ANDROID_CPU_ARM_FEATURE_VFPv2       = (1 << 4),
+    ANDROID_CPU_ARM_FEATURE_VFP_D32     = (1 << 5),
+    ANDROID_CPU_ARM_FEATURE_VFP_FP16    = (1 << 6),
+    ANDROID_CPU_ARM_FEATURE_VFP_FMA     = (1 << 7),
+    ANDROID_CPU_ARM_FEATURE_NEON_FMA    = (1 << 8),
+    ANDROID_CPU_ARM_FEATURE_IDIV_ARM    = (1 << 9),
+    ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
+    ANDROID_CPU_ARM_FEATURE_iWMMXt      = (1 << 11),
+};
+
+enum {
+    ANDROID_CPU_X86_FEATURE_SSSE3  = (1 << 0),
+    ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
+    ANDROID_CPU_X86_FEATURE_MOVBE  = (1 << 2),
+};
+
+extern uint64_t    android_getCpuFeatures(void);
+
+/* Return the number of CPU cores detected on this device. */
+extern int         android_getCpuCount(void);
+
+/* The following is used to force the CPU count and features
+ * mask in sandboxed processes. Under 4.1 and higher, these processes
+ * cannot access /proc, which is the only way to get information from
+ * the kernel about the current hardware (at least on ARM).
+ *
+ * It _must_ be called only once, and before any android_getCpuXXX
+ * function, any other case will fail.
+ *
+ * This function return 1 on success, and 0 on failure.
+ */
+extern int android_setCpu(int      cpu_count,
+                          uint64_t cpu_features);
+
+__END_DECLS
+
+#endif /* CPU_FEATURES_H */
--- a/3rdparty/libwebp/dec/alpha.c
+++ b/3rdparty/libwebp/dec/alpha.c
@@ -0,0 +1,129 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane decompression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "../utils/filters.h"
+#include "../utils/quant_levels_dec.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// TODO(skal): move to dsp/ ?
+static void CopyPlane(const uint8_t* src, int src_stride,
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
+// The 'output' buffer should be pre-allocated and must be of the same
+// dimension 'height'x'stride', as that of the image.
+//
+// Returns 1 on successfully decoding the compressed alpha and
+//         0 if either:
+//           error in bit-stream header (invalid compression mode or filter), or
+//           error returned by appropriate compression method.
+
+static int DecodeAlpha(const uint8_t* data, size_t data_size,
+                       int width, int height, int stride, uint8_t* output) {
+  uint8_t* decoded_data = NULL;
+  const size_t decoded_size = height * width;
+  WEBP_FILTER_TYPE filter;
+  int pre_processing;
+  int rsrv;
+  int ok = 0;
+  int method;
+
+  assert(width > 0 && height > 0 && stride >= width);
+  assert(data != NULL && output != NULL);
+
+  if (data_size <= ALPHA_HEADER_LEN) {
+    return 0;
+  }
+
+  method = (data[0] >> 0) & 0x03;
+  filter = (data[0] >> 2) & 0x03;
+  pre_processing = (data[0] >> 4) & 0x03;
+  rsrv = (data[0] >> 6) & 0x03;
+  if (method < ALPHA_NO_COMPRESSION ||
+      method > ALPHA_LOSSLESS_COMPRESSION ||
+      filter >= WEBP_FILTER_LAST ||
+      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+      rsrv != 0) {
+    return 0;
+  }
+
+  if (method == ALPHA_NO_COMPRESSION) {
+    ok = (data_size >= decoded_size);
+    decoded_data = (uint8_t*)data + ALPHA_HEADER_LEN;
+  } else {
+    decoded_data = (uint8_t*)malloc(decoded_size);
+    if (decoded_data == NULL) return 0;
+    ok = VP8LDecodeAlphaImageStream(width, height,
+                                    data + ALPHA_HEADER_LEN,
+                                    data_size - ALPHA_HEADER_LEN,
+                                    decoded_data);
+  }
+
+  if (ok) {
+    WebPUnfilterFunc unfilter_func = WebPUnfilters[filter];
+    if (unfilter_func != NULL) {
+      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
+      // and apply filter per image-row.
+      unfilter_func(width, height, width, decoded_data);
+    }
+    // Construct raw_data (height x stride) from alpha data (height x width).
+    CopyPlane(decoded_data, width, output, stride, width, height);
+    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
+      ok = DequantizeLevels(decoded_data, width, height);
+    }
+  }
+
+  if (method != ALPHA_NO_COMPRESSION) {
+    free(decoded_data);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows) {
+  const int stride = dec->pic_hdr_.width_;
+
+  if (row < 0 || num_rows < 0 || row + num_rows > dec->pic_hdr_.height_) {
+    return NULL;    // sanity check.
+  }
+
+  if (row == 0) {
+    // Decode everything during the first call.
+    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
+                     dec->pic_hdr_.width_, dec->pic_hdr_.height_, stride,
+                     dec->alpha_plane_)) {
+      return NULL;  // Error.
+    }
+  }
+
+  // Return a pointer to the current decoded row.
+  return dec->alpha_plane_ + row * stride;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/buffer.c
+++ b/3rdparty/libwebp/dec/buffer.c
@@ -0,0 +1,215 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Everything about WebPDecBuffer
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./webpi.h"
+#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer
+
+// Number of bytes per pixel for the different color-spaces.
+static const int kModeBpp[MODE_LAST] = {
+  3, 4, 3, 4, 4, 2, 2,
+  4, 4, 4, 2,    // pre-multiplied modes
+  1, 1 };
+
+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+// Convert to an integer to handle both the unsigned/signed enum cases
+// without the need for casting to remove type limit warnings.
+static int IsValidColorspace(int webp_csp_mode) {
+  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
+}
+
+static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
+  int ok = 1;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+  const int width = buffer->width;
+  const int height = buffer->height;
+  if (!IsValidColorspace(mode)) {
+    ok = 0;
+  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
+    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const uint64_t y_size = (uint64_t)buf->y_stride * height;
+    const uint64_t u_size = (uint64_t)buf->u_stride * ((height + 1) / 2);
+    const uint64_t v_size = (uint64_t)buf->v_stride * ((height + 1) / 2);
+    const uint64_t a_size = (uint64_t)buf->a_stride * height;
+    ok &= (y_size <= buf->y_size);
+    ok &= (u_size <= buf->u_size);
+    ok &= (v_size <= buf->v_size);
+    ok &= (buf->y_stride >= width);
+    ok &= (buf->u_stride >= (width + 1) / 2);
+    ok &= (buf->v_stride >= (width + 1) / 2);
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
+      ok &= (buf->a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
+    }
+  } else {    // RGB checks
+    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    const uint64_t size = (uint64_t)buf->stride * height;
+    ok &= (size <= buf->size);
+    ok &= (buf->stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
+  }
+  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
+}
+
+static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
+  const int w = buffer->width;
+  const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+
+  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+
+  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
+    uint8_t* output;
+    int uv_stride = 0, a_stride = 0;
+    uint64_t uv_size = 0, a_size = 0, total_size;
+    // We need memory and it hasn't been allocated yet.
+    // => initialize output buffer, now that dimensions are known.
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;
+
+    if (!WebPIsRGBMode(mode)) {
+      uv_stride = (w + 1) / 2;
+      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
+      if (mode == MODE_YUVA) {
+        a_stride = w;
+        a_size = (uint64_t)a_stride * h;
+      }
+    }
+    total_size = size + 2 * uv_size + a_size;
+
+    // Security/sanity checks
+    output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
+    if (output == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    buffer->private_memory = output;
+
+    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
+      WebPYUVABuffer* const buf = &buffer->u.YUVA;
+      buf->y = output;
+      buf->y_stride = stride;
+      buf->y_size = (size_t)size;
+      buf->u = output + size;
+      buf->u_stride = uv_stride;
+      buf->u_size = (size_t)uv_size;
+      buf->v = output + size + uv_size;
+      buf->v_stride = uv_stride;
+      buf->v_size = (size_t)uv_size;
+      if (mode == MODE_YUVA) {
+        buf->a = output + size + 2 * uv_size;
+      }
+      buf->a_size = (size_t)a_size;
+      buf->a_stride = a_stride;
+    } else {  // RGBA initialization
+      WebPRGBABuffer* const buf = &buffer->u.RGBA;
+      buf->rgba = output;
+      buf->stride = stride;
+      buf->size = (size_t)size;
+    }
+  }
+  return CheckDecBuffer(buffer);
+}
+
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const out) {
+  if (out == NULL || w <= 0 || h <= 0) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (options != NULL) {    // First, apply options if there is any.
+    if (options->use_cropping) {
+      const int cw = options->crop_width;
+      const int ch = options->crop_height;
+      const int x = options->crop_left & ~1;
+      const int y = options->crop_top & ~1;
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
+      }
+      w = cw;
+      h = ch;
+    }
+    if (options->use_scaling) {
+      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+        return VP8_STATUS_INVALID_PARAM;
+      }
+      w = options->scaled_width;
+      h = options->scaled_height;
+    }
+  }
+  out->width = w;
+  out->height = h;
+
+  // Then, allocate buffer for real
+  return AllocateBuffer(out);
+}
+
+//------------------------------------------------------------------------------
+// constructors / destructors
+
+int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // version mismatch
+  }
+  if (buffer == NULL) return 0;
+  memset(buffer, 0, sizeof(*buffer));
+  return 1;
+}
+
+void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
+  if (buffer != NULL) {
+    if (!buffer->is_external_memory)
+      free(buffer->private_memory);
+    buffer->private_memory = NULL;
+  }
+}
+
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst) {
+  if (src != NULL && dst != NULL) {
+    *dst = *src;
+    if (src->private_memory != NULL) {
+      dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
+      dst->private_memory = NULL;
+    }
+  }
+}
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
+  if (src != NULL && dst != NULL) {
+    *dst = *src;
+    if (src->private_memory != NULL) {
+      src->is_external_memory = 1;   // src relinquishes ownership
+      src->private_memory = NULL;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/decode_vp8.h
+++ b/3rdparty/libwebp/dec/decode_vp8.h
@@ -0,0 +1,182 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//  Low-level API for VP8 decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_DECODE_VP8_H_
+#define WEBP_WEBP_DECODE_VP8_H_
+
+#include "../webp/decode.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Lower-level API
+//
+// These functions provide fine-grained control of the decoding process.
+// The call flow should resemble:
+//
+//   VP8Io io;
+//   VP8InitIo(&io);
+//   io.data = data;
+//   io.data_size = size;
+//   /* customize io's functions (setup()/put()/teardown()) if needed. */
+//
+//   VP8Decoder* dec = VP8New();
+//   bool ok = VP8Decode(dec);
+//   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
+//   VP8Delete(dec);
+//   return ok;
+
+// Input / Output
+typedef struct VP8Io VP8Io;
+typedef int (*VP8IoPutHook)(const VP8Io* io);
+typedef int (*VP8IoSetupHook)(VP8Io* io);
+typedef void (*VP8IoTeardownHook)(const VP8Io* io);
+
+struct VP8Io {
+  // set by VP8GetHeaders()
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.
+
+  // set before calling put()
+  int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
+  int mb_h;                  // number of rows in the sample
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
+  int y_stride;              // row stride for luma
+  int uv_stride;             // row stride for chroma
+
+  void* opaque;              // user data
+
+  // called when fresh samples are available. Currently, samples are in
+  // YUV420 format, and can be up to width x 24 in size (depending on the
+  // in-loop filtering level, e.g.). Should return false in case of error
+  // or abort request. The actual size of the area to update is mb_w x mb_h
+  // in size, taking cropping into account.
+  VP8IoPutHook put;
+
+  // called just before starting to decode the blocks.
+  // Must return false in case of setup error, true otherwise. If false is
+  // returned, teardown() will NOT be called. But if the setup succeeded
+  // and true is returned, then teardown() will always be called afterward.
+  VP8IoSetupHook setup;
+
+  // Called just after block decoding is finished (or when an error occurred
+  // during put()). Is NOT called if setup() failed.
+  VP8IoTeardownHook teardown;
+
+  // this is a recommendation for the user-side yuv->rgb converter. This flag
+  // is set when calling setup() hook and can be overwritten by it. It then
+  // can be taken into consideration during the put() method.
+  int fancy_upsampling;
+
+  // Input buffer.
+  size_t data_size;
+  const uint8_t* data;
+
+  // If true, in-loop filtering will not be performed even if present in the
+  // bitstream. Switching off filtering may speed up decoding at the expense
+  // of more visible blocking. Note that output will also be non-compliant
+  // with the VP8 specifications.
+  int bypass_filtering;
+
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
+  // If non NULL, pointer to the alpha data (if present) corresponding to the
+  // start of the current row (That is: it is pre-offset by mb_y and takes
+  // cropping into account).
+  const uint8_t* a;
+};
+
+// Internal, version-checked, entry point
+int VP8InitIoInternal(VP8Io* const, int);
+
+// Set the custom IO function pointers and user-data. The setter for IO hooks
+// should be called before initiating incremental decoding. Returns true if
+// WebPIDecoder object is successfully modified, false otherwise.
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data);
+
+// Main decoding object. This is an opaque structure.
+typedef struct VP8Decoder VP8Decoder;
+
+// Create a new decoder object.
+VP8Decoder* VP8New(void);
+
+// Must be called to make sure 'io' is initialized properly.
+// Returns false in case of version mismatch. Upon such failure, no other
+// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
+static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
+}
+
+// Start decoding a new picture. Returns true if ok.
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+
+// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
+// Returns false in case of error.
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+
+// Return current status of the decoder:
+VP8StatusCode VP8Status(VP8Decoder* const dec);
+
+// return readable string corresponding to the last status.
+const char* VP8StatusMessage(VP8Decoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Not a mandatory call between calls to VP8Decode().
+void VP8Clear(VP8Decoder* const dec);
+
+// Destroy the decoder object.
+void VP8Delete(VP8Decoder* const dec);
+
+//------------------------------------------------------------------------------
+// Miscellaneous VP8/VP8L bitstream probing functions.
+
+// Returns true if the next 3 bytes in data contain the VP8 signature.
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+
+// Validates the VP8 data-header and retrieves basic header information viz
+// width and height. Returns 0 in case of formatting error. *width/*height
+// can be passed NULL.
+WEBP_EXTERN(int) VP8GetInfo(
+    const uint8_t* data,
+    size_t data_size,    // data available so far
+    size_t chunk_size,   // total data size expected in the chunk
+    int* const width, int* const height);
+
+// Returns true if the next byte(s) in data is a VP8L signature.
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+
+// Validates the VP8L data-header and retrieves basic header information viz
+// width, height and alpha. Returns 0 in case of formatting error.
+// width/height/has_alpha can be passed NULL.
+WEBP_EXTERN(int) VP8LGetInfo(
+    const uint8_t* data, size_t data_size,  // data available so far
+    int* const width, int* const height, int* const has_alpha);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
--- a/3rdparty/libwebp/dec/frame.c
+++ b/3rdparty/libwebp/dec/frame.c
@@ -0,0 +1,692 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Frame-reconstruction function. Memory allocation.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./vp8i.h"
+#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define ALIGN_MASK (32 - 1)
+
+//------------------------------------------------------------------------------
+// Filtering
+
+// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
+// for caching, given a filtering level.
+// Simple filter:  up to 2 luma samples are read and 1 is written.
+// Complex filter: up to 4 luma samples are read and 3 are written. Same for
+//                 U/V, so it's 8 samples total (because of the 2x upsampling).
+static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
+
+static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
+  if (keyframe) {
+    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+  } else {
+    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
+  }
+}
+
+static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int y_bps = dec->cache_y_stride_;
+  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
+  const int level = f_info->f_level_;
+  const int ilevel = f_info->f_ilevel_;
+  const int limit = 2 * level + ilevel;
+  if (level == 0) {
+    return;
+  }
+  if (dec->filter_type_ == 1) {   // simple
+    if (mb_x > 0) {
+      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (f_info->f_inner_) {
+      VP8SimpleHFilter16i(y_dst, y_bps, limit);
+    }
+    if (mb_y > 0) {
+      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (f_info->f_inner_) {
+      VP8SimpleVFilter16i(y_dst, y_bps, limit);
+    }
+  } else {    // complex
+    const int uv_bps = dec->cache_uv_stride_;
+    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh =
+        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
+    if (mb_x > 0) {
+      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (f_info->f_inner_) {
+      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+    if (mb_y > 0) {
+      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (f_info->f_inner_) {
+      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+  }
+}
+
+// Filter the decoded macroblock row (if needed)
+static void FilterRow(const VP8Decoder* const dec) {
+  int mb_x;
+  const int mb_y = dec->thread_ctx_.mb_y_;
+  assert(dec->thread_ctx_.filter_row_);
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    DoFilter(dec, mb_x, mb_y);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
+
+static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
+  if (dec->filter_type_ > 0) {
+    int s;
+    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
+    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+      int i4x4;
+      // First, compute the initial level
+      int base_level;
+      if (dec->segment_hdr_.use_segment_) {
+        base_level = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          base_level += hdr->level_;
+        }
+      } else {
+        base_level = hdr->level_;
+      }
+      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
+        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
+        int level = base_level;
+        if (hdr->use_lf_delta_) {
+          // TODO(skal): only CURRENT is handled for now.
+          level += hdr->ref_lf_delta_[0];
+          if (i4x4) {
+            level += hdr->mode_lf_delta_[0];
+          }
+        }
+        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+        info->f_level_ = level;
+
+        if (hdr->sharpness_ > 0) {
+          if (hdr->sharpness_ > 4) {
+            level >>= 2;
+          } else {
+            level >>= 1;
+          }
+          if (level > 9 - hdr->sharpness_) {
+            level = 9 - hdr->sharpness_;
+          }
+        }
+        info->f_ilevel_ = (level < 1) ? 1 : level;
+        info->f_inner_ = 0;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// This function is called after a row of macroblocks is finished decoding.
+// It also takes into account the following restrictions:
+//  * In case of in-loop filtering, we must hold off sending some of the bottom
+//    pixels as they are yet unfiltered. They will be when the next macroblock
+//    row is decoded. Meanwhile, we must preserve them by rotating them in the
+//    cache area. This doesn't hold for the very bottom row of the uncropped
+//    picture of course.
+//  * we must clip the remaining pixels against the cropping area. The VP8Io
+//    struct must have the following fields set correctly before calling put():
+
+#define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
+
+// Finalize and transmit a complete row. Return false in case of user-abort.
+static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
+  const int ysize = extra_y_rows * dec->cache_y_stride_;
+  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
+  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
+  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
+  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
+  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
+  const int first_row = (ctx->mb_y_ == 0);
+  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
+  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
+  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+
+  if (ctx->filter_row_) {
+    FilterRow(dec);
+  }
+
+  if (io->put) {
+    if (!first_row) {
+      y_start -= extra_y_rows;
+      io->y = ydst;
+      io->u = udst;
+      io->v = vdst;
+    } else {
+      io->y = dec->cache_y_ + y_offset;
+      io->u = dec->cache_u_ + uv_offset;
+      io->v = dec->cache_v_ + uv_offset;
+    }
+
+    if (!last_row) {
+      y_end -= extra_y_rows;
+    }
+    if (y_end > io->crop_bottom) {
+      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
+    }
+    io->a = NULL;
+    if (dec->alpha_data_ != NULL && y_start < y_end) {
+      // TODO(skal): several things to correct here:
+      // * testing presence of alpha with dec->alpha_data_ is not a good idea
+      // * we're actually decompressing the full plane only once. It should be
+      //   more obvious from signature.
+      // * we could free alpha_data_ right after this call, but we don't own.
+      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
+      if (io->a == NULL) {
+        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                           "Could not decode alpha data.");
+      }
+    }
+    if (y_start < io->crop_top) {
+      const int delta_y = io->crop_top - y_start;
+      y_start = io->crop_top;
+      assert(!(delta_y & 1));
+      io->y += dec->cache_y_stride_ * delta_y;
+      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
+      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
+      if (io->a != NULL) {
+        io->a += io->width * delta_y;
+      }
+    }
+    if (y_start < y_end) {
+      io->y += io->crop_left;
+      io->u += io->crop_left >> 1;
+      io->v += io->crop_left >> 1;
+      if (io->a != NULL) {
+        io->a += io->crop_left;
+      }
+      io->mb_y = y_start - io->crop_top;
+      io->mb_w = io->crop_right - io->crop_left;
+      io->mb_h = y_end - y_start;
+      ok = io->put(io);
+    }
+  }
+  // rotate top samples if needed
+  if (ctx->id_ + 1 == dec->num_caches_) {
+    if (!last_row) {
+      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
+      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
+      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
+    }
+  }
+
+  return ok;
+}
+
+#undef MACROBLOCK_VPOS
+
+//------------------------------------------------------------------------------
+
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  if (!dec->use_threads_) {
+    // ctx->id_ and ctx->f_info_ are already set
+    ctx->mb_y_ = dec->mb_y_;
+    ctx->filter_row_ = dec->filter_row_;
+    ok = FinishRow(dec, io);
+  } else {
+    WebPWorker* const worker = &dec->worker_;
+    // Finish previous job *before* updating context
+    ok &= WebPWorkerSync(worker);
+    assert(worker->status_ == OK);
+    if (ok) {   // spawn a new deblocking/output job
+      ctx->io_ = *io;
+      ctx->id_ = dec->cache_id_;
+      ctx->mb_y_ = dec->mb_y_;
+      ctx->filter_row_ = dec->filter_row_;
+      if (ctx->filter_row_) {    // just swap filter info
+        VP8FInfo* const tmp = ctx->f_info_;
+        ctx->f_info_ = dec->f_info_;
+        dec->f_info_ = tmp;
+      }
+      WebPWorkerLaunch(worker);
+      if (++dec->cache_id_ == dec->num_caches_) {
+        dec->cache_id_ = 0;
+      }
+    }
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Finish setting up the decoding parameter once user's setup() is called.
+
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
+  // Call setup() first. This may trigger additional decoding features on 'io'.
+  // Note: Afterward, we must call teardown() not matter what.
+  if (io->setup && !io->setup(io)) {
+    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
+    return dec->status_;
+  }
+
+  // Disable filtering per user request
+  if (io->bypass_filtering) {
+    dec->filter_type_ = 0;
+  }
+  // TODO(skal): filter type / strength / sharpness forcing
+
+  // Define the area where we can skip in-loop filtering, in case of cropping.
+  //
+  // 'Simple' filter reads two luma samples outside of the macroblock and
+  // and filters one. It doesn't filter the chroma samples. Hence, we can
+  // avoid doing the in-loop filtering before crop_top/crop_left position.
+  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
+  // Means: there's a dependency chain that goes all the way up to the
+  // top-left corner of the picture (MB #0). We must filter all the previous
+  // macroblocks.
+  // TODO(skal): add an 'approximate_decoding' option, that won't produce
+  // a 1:1 bit-exactness for complex filtering?
+  {
+    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
+    if (dec->filter_type_ == 2) {
+      // For complex filter, we need to preserve the dependency chain.
+      dec->tl_mb_x_ = 0;
+      dec->tl_mb_y_ = 0;
+    } else {
+      // For simple filter, we can filter only the cropped region.
+      // We include 'extra_pixels' on the other side of the boundary, since
+      // vertical or horizontal filtering of the previous macroblock can
+      // modify some abutting pixels.
+      dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
+      dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
+      if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
+      if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
+    }
+    // We need some 'extra' pixels on the right/bottom.
+    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
+    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
+    if (dec->br_mb_x_ > dec->mb_w_) {
+      dec->br_mb_x_ = dec->mb_w_;
+    }
+    if (dec->br_mb_y_ > dec->mb_h_) {
+      dec->br_mb_y_ = dec->mb_h_;
+    }
+  }
+  PrecomputeFilterStrengths(dec);
+  return VP8_STATUS_OK;
+}
+
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  if (dec->use_threads_) {
+    ok = WebPWorkerSync(&dec->worker_);
+  }
+
+  if (io->teardown) {
+    io->teardown(io);
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
+//
+// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
+// immediately, and needs to wait for first few rows of the next macroblock to
+// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
+// on strength).
+// With two threads, the vertical positions of the rows being decoded are:
+// Decode:  [ 0..15][16..31][32..47][48..63][64..79][...
+// Deblock:         [ 0..11][12..27][28..43][44..59][...
+// If we use two threads and two caches of 16 pixels, the sequence would be:
+// Decode:  [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
+// Deblock:         [ 0..11][12..27!!][-4..11][12..27][...
+// The problem occurs during row [12..15!!] that both the decoding and
+// deblocking threads are writing simultaneously.
+// With 3 cache lines, one get a safe write pattern:
+// Decode:  [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
+// Deblock:         [ 0..11][12..27][28..43][-4..11][12..27][28...
+// Note that multi-threaded output _without_ deblocking can make use of two
+// cache lines of 16 pixels only, since there's no lagging behind. The decoding
+// and output process have non-concurrent writing:
+// Decode:  [ 0..15][16..31][ 0..15][16..31][...
+// io->put:         [ 0..15][16..31][ 0..15][...
+
+#define MT_CACHE_LINES 3
+#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
+
+// Initialize multi/single-thread worker
+static int InitThreadContext(VP8Decoder* const dec) {
+  dec->cache_id_ = 0;
+  if (dec->use_threads_) {
+    WebPWorker* const worker = &dec->worker_;
+    if (!WebPWorkerReset(worker)) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "thread initialization failed.");
+    }
+    worker->data1 = dec;
+    worker->data2 = (void*)&dec->thread_ctx_.io_;
+    worker->hook = (WebPWorkerHook)FinishRow;
+    dec->num_caches_ =
+      (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
+  } else {
+    dec->num_caches_ = ST_CACHE_LINES;
+  }
+  return 1;
+}
+
+#undef MT_CACHE_LINES
+#undef ST_CACHE_LINES
+
+//------------------------------------------------------------------------------
+// Memory setup
+
+static int AllocateMemory(VP8Decoder* const dec) {
+  const int num_caches = dec->num_caches_;
+  const int mb_w = dec->mb_w_;
+  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
+  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
+  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
+  const size_t f_info_size =
+      (dec->filter_type_ > 0) ?
+          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+        : 0;
+  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
+  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t cache_height = (16 * num_caches
+                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const size_t cache_size = top_size * cache_height;
+  // alpha_size is the only one that scales as width x height.
+  const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
+      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
+  const uint64_t needed = (uint64_t)intra_pred_mode_size
+                        + top_size + mb_info_size + f_info_size
+                        + yuv_size + coeffs_size
+                        + cache_size + alpha_size + ALIGN_MASK;
+  uint8_t* mem;
+
+  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (needed > dec->mem_size_) {
+    free(dec->mem_);
+    dec->mem_size_ = 0;
+    dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
+    if (dec->mem_ == NULL) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "no memory during frame initialization.");
+    }
+    // down-cast is ok, thanks to WebPSafeAlloc() above.
+    dec->mem_size_ = (size_t)needed;
+  }
+
+  mem = (uint8_t*)dec->mem_;
+  dec->intra_t_ = (uint8_t*)mem;
+  mem += intra_pred_mode_size;
+
+  dec->y_t_ = (uint8_t*)mem;
+  mem += 16 * mb_w;
+  dec->u_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+  dec->v_t_ = (uint8_t*)mem;
+  mem += 8 * mb_w;
+
+  dec->mb_info_ = ((VP8MB*)mem) + 1;
+  mem += mb_info_size;
+
+  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
+  mem += f_info_size;
+  dec->thread_ctx_.id_ = 0;
+  dec->thread_ctx_.f_info_ = dec->f_info_;
+  if (dec->use_threads_) {
+    // secondary cache line. The deblocking process need to make use of the
+    // filtering strength from previous macroblock row, while the new ones
+    // are being decoded in parallel. We'll just swap the pointers.
+    dec->thread_ctx_.f_info_ += mb_w;
+  }
+
+  mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK);
+  assert((yuv_size & ALIGN_MASK) == 0);
+  dec->yuv_b_ = (uint8_t*)mem;
+  mem += yuv_size;
+
+  dec->coeffs_ = (int16_t*)mem;
+  mem += coeffs_size;
+
+  dec->cache_y_stride_ = 16 * mb_w;
+  dec->cache_uv_stride_ = 8 * mb_w;
+  {
+    const int extra_rows = kFilterExtraRows[dec->filter_type_];
+    const int extra_y = extra_rows * dec->cache_y_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_u_ = dec->cache_y_
+                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_
+                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_id_ = 0;
+  }
+  mem += cache_size;
+
+  // alpha plane
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  mem += alpha_size;
+  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
+
+  // note: left-info is initialized once for all.
+  memset(dec->mb_info_ - 1, 0, mb_info_size);
+
+  // initialize top
+  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
+
+  return 1;
+}
+
+static void InitIo(VP8Decoder* const dec, VP8Io* io) {
+  // prepare 'io'
+  io->mb_y = 0;
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->a = NULL;
+}
+
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
+  if (!AllocateMemory(dec)) return 0;
+  InitIo(dec, io);
+  VP8DspInit();  // Init critical function pointers and look-up tables.
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main reconstruction function.
+
+static const int kScan[16] = {
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
+};
+
+static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
+  if (mode == B_DC_PRED) {
+    if (dec->mb_x_ == 0) {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    } else {
+      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+    }
+  }
+  return mode;
+}
+
+static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
+  *(uint32_t*)dst = *(uint32_t*)src;
+}
+
+void VP8ReconstructBlock(VP8Decoder* const dec) {
+  int j;
+  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
+  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
+  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+
+  // Rotate in the left samples from previously decoded block. We move four
+  // pixels at a time for alignment reason, and because of in-loop filter.
+  if (dec->mb_x_ > 0) {
+    for (j = -1; j < 16; ++j) {
+      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+    }
+    for (j = -1; j < 8; ++j) {
+      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+    }
+  } else {
+    for (j = 0; j < 16; ++j) {
+      y_dst[j * BPS - 1] = 129;
+    }
+    for (j = 0; j < 8; ++j) {
+      u_dst[j * BPS - 1] = 129;
+      v_dst[j * BPS - 1] = 129;
+    }
+    // Init top-left sample on left column too
+    if (dec->mb_y_ > 0) {
+      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+    }
+  }
+  {
+    // bring top samples into the cache
+    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
+    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
+    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
+    const int16_t* coeffs = dec->coeffs_;
+    int n;
+
+    if (dec->mb_y_ > 0) {
+      memcpy(y_dst - BPS, top_y, 16);
+      memcpy(u_dst - BPS, top_u, 8);
+      memcpy(v_dst - BPS, top_v, 8);
+    } else if (dec->mb_x_ == 0) {
+      // we only need to do this init once at block (0,0).
+      // Afterward, it remains valid for the whole topmost row.
+      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+      memset(u_dst - BPS - 1, 127, 8 + 1);
+      memset(v_dst - BPS - 1, 127, 8 + 1);
+    }
+
+    // predict and add residuals
+
+    if (dec->is_i4x4_) {   // 4x4
+      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+      if (dec->mb_y_ > 0) {
+        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
+          top_right[0] = top_y[15] * 0x01010101u;
+        } else {
+          memcpy(top_right, top_y + 16, sizeof(*top_right));
+        }
+      }
+      // replicate the top-right pixels below
+      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+      // predict and add residues for all 4x4 blocks in turn.
+      for (n = 0; n < 16; n++) {
+        uint8_t* const dst = y_dst + kScan[n];
+        VP8PredLuma4[dec->imodes_[n]](dst);
+        if (dec->non_zero_ac_ & (1 << n)) {
+          VP8Transform(coeffs + n * 16, dst, 0);
+        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+          VP8TransformDC(coeffs + n * 16, dst);
+        }
+      }
+    } else {    // 16x16
+      const int pred_func = CheckMode(dec, dec->imodes_[0]);
+      VP8PredLuma16[pred_func](y_dst);
+      if (dec->non_zero_) {
+        for (n = 0; n < 16; n++) {
+          uint8_t* const dst = y_dst + kScan[n];
+          if (dec->non_zero_ac_ & (1 << n)) {
+            VP8Transform(coeffs + n * 16, dst, 0);
+          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
+            VP8TransformDC(coeffs + n * 16, dst);
+          }
+        }
+      }
+    }
+    {
+      // Chroma
+      const int pred_func = CheckMode(dec, dec->uvmode_);
+      VP8PredChroma8[pred_func](u_dst);
+      VP8PredChroma8[pred_func](v_dst);
+
+      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
+        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
+        if (dec->non_zero_ac_ & 0x0f0000) {
+          VP8TransformUV(u_coeffs, u_dst);
+        } else {
+          VP8TransformDCUV(u_coeffs, u_dst);
+        }
+      }
+      if (dec->non_zero_ & 0xf00000) {   // chroma-V
+        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
+        if (dec->non_zero_ac_ & 0xf00000) {
+          VP8TransformUV(v_coeffs, v_dst);
+        } else {
+          VP8TransformDCUV(v_coeffs, v_dst);
+        }
+      }
+
+      // stash away top samples for next block
+      if (dec->mb_y_ < dec->mb_h_ - 1) {
+        memcpy(top_y, y_dst + 15 * BPS, 16);
+        memcpy(top_u, u_dst +  7 * BPS,  8);
+        memcpy(top_v, v_dst +  7 * BPS,  8);
+      }
+    }
+  }
+  // Transfer reconstructed samples from yuv_b_ cache to final destination.
+  {
+    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
+    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
+    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
+    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
+    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
+    for (j = 0; j < 16; ++j) {
+      memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+    }
+    for (j = 0; j < 8; ++j) {
+      memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+      memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/idec.c
+++ b/3rdparty/libwebp/dec/idec.c
@@ -0,0 +1,814 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Incremental decoding
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "./webpi.h"
+#include "./vp8i.h"
+#include "../utils/utils.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+// In append mode, buffer allocations increase as multiples of this value.
+// Needs to be a power of 2.
+#define CHUNK_SIZE 4096
+#define MAX_MB_SIZE 4096
+
+//------------------------------------------------------------------------------
+// Data structures for memory and states
+
+// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
+// If there is any error the decoder goes into state ERROR.
+typedef enum {
+  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
+  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_VP8_PARTS0,
+  STATE_VP8_DATA,
+  STATE_VP8L_HEADER,
+  STATE_VP8L_DATA,
+  STATE_DONE,
+  STATE_ERROR
+} DecState;
+
+// Operating state for the MemBuffer
+typedef enum {
+  MEM_MODE_NONE = 0,
+  MEM_MODE_APPEND,
+  MEM_MODE_MAP
+} MemBufferMode;
+
+// storage for partition #0 and partial data (in a rolling fashion)
+typedef struct {
+  MemBufferMode mode_;  // Operation mode
+  size_t start_;        // start location of the data to be decoded
+  size_t end_;          // end location
+  size_t buf_size_;     // size of the allocated buffer
+  uint8_t* buf_;        // We don't own this buffer in case WebPIUpdate()
+
+  size_t part0_size_;         // size of partition #0
+  const uint8_t* part0_buf_;  // buffer to store partition #0
+} MemBuffer;
+
+struct WebPIDecoder {
+  DecState state_;         // current decoding state
+  WebPDecParams params_;   // Params to store output info
+  int is_lossless_;        // for down-casting 'dec_'.
+  void* dec_;              // either a VP8Decoder or a VP8LDecoder instance
+  VP8Io io_;
+
+  MemBuffer mem_;          // input memory buffer.
+  WebPDecBuffer output_;   // output buffer (when no external one is supplied)
+  size_t chunk_size_;      // Compressed VP8/VP8L size extracted from Header.
+};
+
+// MB context to restore in case VP8DecodeMB() fails
+typedef struct {
+  VP8MB left_;
+  VP8MB info_;
+  uint8_t intra_t_[4];
+  uint8_t intra_l_[4];
+  VP8BitReader br_;
+  VP8BitReader token_br_;
+} MBContext;
+
+//------------------------------------------------------------------------------
+// MemBuffer: incoming data handling
+
+static void RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+  if (br->buf_ != NULL) {
+    br->buf_ += offset;
+    br->buf_end_ += offset;
+  }
+}
+
+static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
+  return (mem->end_ - mem->start_);
+}
+
+static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* const new_base = mem->buf_ + mem->start_;
+  // note: for VP8, setting up idec->io_ is only really needed at the beginning
+  // of the decoding, till partition #0 is complete.
+  idec->io_.data = new_base;
+  idec->io_.data_size = MemDataSize(mem);
+
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+      const int last_part = dec->num_parts_ - 1;
+      if (offset != 0) {
+        int p;
+        for (p = 0; p <= last_part; ++p) {
+          RemapBitReader(dec->parts_ + p, offset);
+        }
+        // Remap partition #0 data pointer to new offset, but only in MAP
+        // mode (in APPEND mode, partition #0 is copied into a fixed memory).
+        if (mem->mode_ == MEM_MODE_MAP) {
+          RemapBitReader(&dec->br_, offset);
+        }
+      }
+      assert(last_part >= 0);
+      dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
+    } else {    // Resize lossless bitreader
+      VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+      VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
+    }
+  }
+}
+
+// Appends data to the end of MemBuffer->buf_. It expands the allocated memory
+// size if required and also updates VP8BitReader's if new memory is allocated.
+static int AppendToMemBuffer(WebPIDecoder* const idec,
+                             const uint8_t* const data, size_t data_size) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  assert(mem->mode_ == MEM_MODE_APPEND);
+  if (data_size > MAX_CHUNK_PAYLOAD) {
+    // security safeguard: trying to allocate more than what the format
+    // allows for a chunk should be considered a smoke smell.
+    return 0;
+  }
+
+  if (mem->end_ + data_size > mem->buf_size_) {  // Need some free memory
+    const size_t current_size = MemDataSize(mem);
+    const uint64_t new_size = (uint64_t)current_size + data_size;
+    const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
+    uint8_t* const new_buf =
+        (uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
+    if (new_buf == NULL) return 0;
+    memcpy(new_buf, old_base, current_size);
+    free(mem->buf_);
+    mem->buf_ = new_buf;
+    mem->buf_size_ = (size_t)extra_size;
+    mem->start_ = 0;
+    mem->end_ = current_size;
+  }
+
+  memcpy(mem->buf_ + mem->end_, data, data_size);
+  mem->end_ += data_size;
+  assert(mem->end_ <= mem->buf_size_);
+
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  return 1;
+}
+
+static int RemapMemBuffer(WebPIDecoder* const idec,
+                          const uint8_t* const data, size_t data_size) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* const old_base = mem->buf_ + mem->start_;
+  assert(mem->mode_ == MEM_MODE_MAP);
+
+  if (data_size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
+
+  mem->buf_ = (uint8_t*)data;
+  mem->end_ = mem->buf_size_ = data_size;
+
+  DoRemap(idec, mem->buf_ + mem->start_ - old_base);
+  return 1;
+}
+
+static void InitMemBuffer(MemBuffer* const mem) {
+  mem->mode_       = MEM_MODE_NONE;
+  mem->buf_        = NULL;
+  mem->buf_size_   = 0;
+  mem->part0_buf_  = NULL;
+  mem->part0_size_ = 0;
+}
+
+static void ClearMemBuffer(MemBuffer* const mem) {
+  assert(mem);
+  if (mem->mode_ == MEM_MODE_APPEND) {
+    free(mem->buf_);
+    free((void*)mem->part0_buf_);
+  }
+}
+
+static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
+  if (mem->mode_ == MEM_MODE_NONE) {
+    mem->mode_ = expected;    // switch to the expected mode
+  } else if (mem->mode_ != expected) {
+    return 0;         // we mixed the modes => error
+  }
+  assert(mem->mode_ == expected);   // mode is ok
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Macroblock-decoding contexts
+
+static void SaveContext(const VP8Decoder* dec, const VP8BitReader* token_br,
+                        MBContext* const context) {
+  const VP8BitReader* const br = &dec->br_;
+  const VP8MB* const left = dec->mb_info_ - 1;
+  const VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+
+  context->left_ = *left;
+  context->info_ = *info;
+  context->br_ = *br;
+  context->token_br_ = *token_br;
+  memcpy(context->intra_t_, dec->intra_t_ + 4 * dec->mb_x_, 4);
+  memcpy(context->intra_l_, dec->intra_l_, 4);
+}
+
+static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
+                           VP8BitReader* const token_br) {
+  VP8BitReader* const br = &dec->br_;
+  VP8MB* const left = dec->mb_info_ - 1;
+  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+
+  *left = context->left_;
+  *info = context->info_;
+  *br = context->br_;
+  *token_br = context->token_br_;
+  memcpy(dec->intra_t_ + 4 * dec->mb_x_, context->intra_t_, 4);
+  memcpy(dec->intra_l_, context->intra_l_, 4);
+}
+
+//------------------------------------------------------------------------------
+
+static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
+  if (idec->state_ == STATE_VP8_DATA) {
+    VP8Io* const io = &idec->io_;
+    if (io->teardown) {
+      io->teardown(io);
+    }
+  }
+  idec->state_ = STATE_ERROR;
+  return error;
+}
+
+static void ChangeState(WebPIDecoder* const idec, DecState new_state,
+                        size_t consumed_bytes) {
+  MemBuffer* const mem = &idec->mem_;
+  idec->state_ = new_state;
+  mem->start_ += consumed_bytes;
+  assert(mem->start_ <= mem->end_);
+  idec->io_.data = mem->buf_ + mem->start_;
+  idec->io_.data_size = MemDataSize(mem);
+}
+
+// Headers
+static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
+  MemBuffer* const mem = &idec->mem_;
+  const uint8_t* data = mem->buf_ + mem->start_;
+  size_t curr_size = MemDataSize(mem);
+  VP8StatusCode status;
+  WebPHeaderStructure headers;
+
+  headers.data = data;
+  headers.data_size = curr_size;
+  status = WebPParseHeaders(&headers);
+  if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;  // We haven't found a VP8 chunk yet.
+  } else if (status != VP8_STATUS_OK) {
+    return IDecError(idec, status);
+  }
+
+  idec->chunk_size_ = headers.compressed_size;
+  idec->is_lossless_ = headers.is_lossless;
+  if (!idec->is_lossless_) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = (idec->params_.options != NULL) &&
+                        (idec->params_.options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    idec->dec_ = dec;
+    ChangeState(idec, STATE_VP8L_HEADER, headers.offset);
+  }
+  return VP8_STATUS_OK;
+}
+
+static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
+  const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
+  const size_t curr_size = MemDataSize(&idec->mem_);
+  uint32_t bits;
+
+  if (curr_size < VP8_FRAME_HEADER_SIZE) {
+    // Not enough data bytes to extract VP8 Frame Header.
+    return VP8_STATUS_SUSPENDED;
+  }
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
+    return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
+  }
+
+  bits = data[0] | (data[1] << 8) | (data[2] << 16);
+  idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;
+
+  idec->io_.data = data;
+  idec->io_.data_size = curr_size;
+  idec->state_ = STATE_VP8_PARTS0;
+  return VP8_STATUS_OK;
+}
+
+// Partition #0
+static int CopyParts0Data(WebPIDecoder* const idec) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+  VP8BitReader* const br = &dec->br_;
+  const size_t psize = br->buf_end_ - br->buf_;
+  MemBuffer* const mem = &idec->mem_;
+  assert(!idec->is_lossless_);
+  assert(mem->part0_buf_ == NULL);
+  assert(psize > 0);
+  assert(psize <= mem->part0_size_);  // Format limit: no need for runtime check
+  if (mem->mode_ == MEM_MODE_APPEND) {
+    // We copy and grab ownership of the partition #0 data.
+    uint8_t* const part0_buf = (uint8_t*)malloc(psize);
+    if (part0_buf == NULL) {
+      return 0;
+    }
+    memcpy(part0_buf, br->buf_, psize);
+    mem->part0_buf_ = part0_buf;
+    br->buf_ = part0_buf;
+    br->buf_end_ = part0_buf + psize;
+  } else {
+    // Else: just keep pointers to the partition #0's data in dec_->br_.
+  }
+  mem->start_ += psize;
+  return 1;
+}
+
+static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+  VP8Io* const io = &idec->io_;
+  const WebPDecParams* const params = &idec->params_;
+  WebPDecBuffer* const output = params->output;
+
+  // Wait till we have enough data for the whole partition #0
+  if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) {
+    return VP8_STATUS_SUSPENDED;
+  }
+
+  if (!VP8GetHeaders(dec, io)) {
+    const VP8StatusCode status = dec->status_;
+    if (status == VP8_STATUS_SUSPENDED ||
+        status == VP8_STATUS_NOT_ENOUGH_DATA) {
+      // treating NOT_ENOUGH_DATA as SUSPENDED state
+      return VP8_STATUS_SUSPENDED;
+    }
+    return IDecError(idec, status);
+  }
+
+  // Allocate/Verify output buffer now
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  if (!CopyParts0Data(idec)) {
+    return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
+  }
+
+  // Finish setting up the decoding parameters. Will call io->setup().
+  if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  // Note: past this point, teardown() must always be called
+  // in case of error.
+  idec->state_ = STATE_VP8_DATA;
+  // Allocate memory and prepare everything.
+  if (!VP8InitFrame(dec, io)) {
+    return IDecError(idec, dec->status_);
+  }
+  return VP8_STATUS_OK;
+}
+
+// Remaining partitions
+static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
+  VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
+  VP8Io* const io = &idec->io_;
+
+  assert(dec->ready_);
+
+  for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
+    VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+    if (dec->mb_x_ == 0) {
+      VP8InitScanline(dec);
+    }
+    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+      MBContext context;
+      SaveContext(dec, token_br, &context);
+
+      if (!VP8DecodeMB(dec, token_br)) {
+        RestoreContext(&context, dec, token_br);
+        // We shouldn't fail when MAX_MB data was available
+        if (dec->num_parts_ == 1 && MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
+          return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
+        }
+        return VP8_STATUS_SUSPENDED;
+      }
+      // Reconstruct and emit samples.
+      VP8ReconstructBlock(dec);
+
+      // Release buffer only if there is only one partition
+      if (dec->num_parts_ == 1) {
+        idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
+        assert(idec->mem_.start_ <= idec->mem_.end_);
+      }
+    }
+    if (!VP8ProcessRow(dec, io)) {
+      return IDecError(idec, VP8_STATUS_USER_ABORT);
+    }
+    dec->mb_x_ = 0;
+  }
+  // Synchronize the thread and check for errors.
+  if (!VP8ExitCritical(dec, io)) {
+    return IDecError(idec, VP8_STATUS_USER_ABORT);
+  }
+  dec->ready_ = 0;
+  idec->state_ = STATE_DONE;
+
+  return VP8_STATUS_OK;
+}
+
+static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+  if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  return IDecError(idec, status);
+}
+
+static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {
+  VP8Io* const io = &idec->io_;
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const WebPDecParams* const params = &idec->params_;
+  WebPDecBuffer* const output = params->output;
+  size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // Wait until there's enough data for decoding header.
+  if (curr_size < (idec->chunk_size_ >> 3)) {
+    return VP8_STATUS_SUSPENDED;
+  }
+  if (!VP8LDecodeHeader(dec, io)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+  // Allocate/verify output buffer now.
+  dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
+                                       output);
+  if (dec->status_ != VP8_STATUS_OK) {
+    return IDecError(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_VP8L_DATA;
+  return VP8_STATUS_OK;
+}
+
+static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
+  VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
+  const size_t curr_size = MemDataSize(&idec->mem_);
+  assert(idec->is_lossless_);
+
+  // At present Lossless decoder can't decode image incrementally. So wait till
+  // all the image data is aggregated before image can be decoded.
+  if (curr_size < idec->chunk_size_) {
+    return VP8_STATUS_SUSPENDED;
+  }
+
+  if (!VP8LDecodeImage(dec)) {
+    return ErrorStatusLossless(idec, dec->status_);
+  }
+
+  idec->state_ = STATE_DONE;
+
+  return VP8_STATUS_OK;
+}
+
+  // Main decoding loop
+static VP8StatusCode IDecode(WebPIDecoder* idec) {
+  VP8StatusCode status = VP8_STATUS_SUSPENDED;
+
+  if (idec->state_ == STATE_PRE_VP8) {
+    status = DecodeWebPHeaders(idec);
+  } else {
+    if (idec->dec_ == NULL) {
+      return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
+    }
+  }
+  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+    status = DecodeVP8FrameHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8_PARTS0) {
+    status = DecodePartition0(idec);
+  }
+  if (idec->state_ == STATE_VP8_DATA) {
+    status = DecodeRemaining(idec);
+  }
+  if (idec->state_ == STATE_VP8L_HEADER) {
+    status = DecodeVP8LHeader(idec);
+  }
+  if (idec->state_ == STATE_VP8L_DATA) {
+    status = DecodeVP8LData(idec);
+  }
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// Public functions
+
+WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
+  WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(*idec));
+  if (idec == NULL) {
+    return NULL;
+  }
+
+  idec->state_ = STATE_PRE_VP8;
+  idec->chunk_size_ = 0;
+
+  InitMemBuffer(&idec->mem_);
+  WebPInitDecBuffer(&idec->output_);
+  VP8InitIo(&idec->io_);
+
+  WebPResetDecParams(&idec->params_);
+  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
+
+  return idec;
+}
+
+WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
+                          WebPDecoderConfig* config) {
+  WebPIDecoder* idec;
+
+  // Parse the bitstream's features, if requested:
+  if (data != NULL && data_size > 0 && config != NULL) {
+    if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) {
+      return NULL;
+    }
+  }
+  // Create an instance of the incremental decoder
+  idec = WebPINewDecoder(config ? &config->output : NULL);
+  if (idec == NULL) {
+    return NULL;
+  }
+  // Finish initialization
+  if (config != NULL) {
+    idec->params_.options = &config->options;
+  }
+  return idec;
+}
+
+void WebPIDelete(WebPIDecoder* idec) {
+  if (idec == NULL) return;
+  if (idec->dec_ != NULL) {
+    if (!idec->is_lossless_) {
+      VP8Delete(idec->dec_);
+    } else {
+      VP8LDelete(idec->dec_);
+    }
+  }
+  ClearMemBuffer(&idec->mem_);
+  WebPFreeDecBuffer(&idec->output_);
+  free(idec);
+}
+
+//------------------------------------------------------------------------------
+// Wrapper toward WebPINewDecoder
+
+WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
+                          size_t output_buffer_size, int output_stride) {
+  const int is_external_memory = (output_buffer != NULL);
+  WebPIDecoder* idec;
+
+  if (mode >= MODE_YUV) return NULL;
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    output_buffer_size = 0;
+    output_stride = 0;
+  } else {  // A buffer was passed. Validate the other params.
+    if (output_stride == 0 || output_buffer_size == 0) {
+      return NULL;   // invalid parameter.
+    }
+  }
+  idec = WebPINewDecoder(NULL);
+  if (idec == NULL) return NULL;
+  idec->output_.colorspace = mode;
+  idec->output_.is_external_memory = is_external_memory;
+  idec->output_.u.RGBA.rgba = output_buffer;
+  idec->output_.u.RGBA.stride = output_stride;
+  idec->output_.u.RGBA.size = output_buffer_size;
+  return idec;
+}
+
+WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride,
+                           uint8_t* a, size_t a_size, int a_stride) {
+  const int is_external_memory = (luma != NULL);
+  WebPIDecoder* idec;
+  WEBP_CSP_MODE colorspace;
+
+  if (!is_external_memory) {    // Overwrite parameters to sane values.
+    luma_size = u_size = v_size = a_size = 0;
+    luma_stride = u_stride = v_stride = a_stride = 0;
+    u = v = a = NULL;
+    colorspace = MODE_YUVA;
+  } else {  // A luma buffer was passed. Validate the other parameters.
+    if (u == NULL || v == NULL) return NULL;
+    if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL;
+    if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL;
+    if (a != NULL) {
+      if (a_size == 0 || a_stride == 0) return NULL;
+    }
+    colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
+  }
+
+  idec = WebPINewDecoder(NULL);
+  if (idec == NULL) return NULL;
+
+  idec->output_.colorspace = colorspace;
+  idec->output_.is_external_memory = is_external_memory;
+  idec->output_.u.YUVA.y = luma;
+  idec->output_.u.YUVA.y_stride = luma_stride;
+  idec->output_.u.YUVA.y_size = luma_size;
+  idec->output_.u.YUVA.u = u;
+  idec->output_.u.YUVA.u_stride = u_stride;
+  idec->output_.u.YUVA.u_size = u_size;
+  idec->output_.u.YUVA.v = v;
+  idec->output_.u.YUVA.v_stride = v_stride;
+  idec->output_.u.YUVA.v_size = v_size;
+  idec->output_.u.YUVA.a = a;
+  idec->output_.u.YUVA.a_stride = a_stride;
+  idec->output_.u.YUVA.a_size = a_size;
+  return idec;
+}
+
+WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
+                          uint8_t* u, size_t u_size, int u_stride,
+                          uint8_t* v, size_t v_size, int v_stride) {
+  return WebPINewYUVA(luma, luma_size, luma_stride,
+                      u, u_size, u_stride,
+                      v, v_size, v_stride,
+                      NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+
+static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
+  assert(idec);
+  if (idec->state_ == STATE_ERROR) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+  if (idec->state_ == STATE_DONE) {
+    return VP8_STATUS_OK;
+  }
+  return VP8_STATUS_SUSPENDED;
+}
+
+VP8StatusCode WebPIAppend(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
+  VP8StatusCode status;
+  if (idec == NULL || data == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  status = IDecCheckStatus(idec);
+  if (status != VP8_STATUS_SUSPENDED) {
+    return status;
+  }
+  // Check mixed calls between RemapMemBuffer and AppendToMemBuffer.
+  if (!CheckMemBufferMode(&idec->mem_, MEM_MODE_APPEND)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  // Append data to memory buffer
+  if (!AppendToMemBuffer(idec, data, data_size)) {
+    return VP8_STATUS_OUT_OF_MEMORY;
+  }
+  return IDecode(idec);
+}
+
+VP8StatusCode WebPIUpdate(WebPIDecoder* idec,
+                          const uint8_t* data, size_t data_size) {
+  VP8StatusCode status;
+  if (idec == NULL || data == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  status = IDecCheckStatus(idec);
+  if (status != VP8_STATUS_SUSPENDED) {
+    return status;
+  }
+  // Check mixed calls between RemapMemBuffer and AppendToMemBuffer.
+  if (!CheckMemBufferMode(&idec->mem_, MEM_MODE_MAP)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  // Make the memory buffer point to the new buffer
+  if (!RemapMemBuffer(idec, data, data_size)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  return IDecode(idec);
+}
+
+//------------------------------------------------------------------------------
+
+static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
+  if (idec == NULL || idec->dec_ == NULL) {
+    return NULL;
+  }
+  if (idec->state_ <= STATE_VP8_PARTS0) {
+    return NULL;
+  }
+  return idec->params_.output;
+}
+
+const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
+                                      int* left, int* top,
+                                      int* width, int* height) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (left != NULL) *left = 0;
+  if (top != NULL) *top = 0;
+  // TODO(skal): later include handling of rotations.
+  if (src) {
+    if (width != NULL) *width = src->width;
+    if (height != NULL) *height = idec->params_.last_y;
+  } else {
+    if (width != NULL) *width = 0;
+    if (height != NULL) *height = 0;
+  }
+  return src;
+}
+
+uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
+                        int* width, int* height, int* stride) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (src == NULL) return NULL;
+  if (src->colorspace >= MODE_YUV) {
+    return NULL;
+  }
+
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.RGBA.stride;
+
+  return src->u.RGBA.rgba;
+}
+
+uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
+                         uint8_t** u, uint8_t** v, uint8_t** a,
+                         int* width, int* height,
+                         int* stride, int* uv_stride, int* a_stride) {
+  const WebPDecBuffer* const src = GetOutputBuffer(idec);
+  if (src == NULL) return NULL;
+  if (src->colorspace < MODE_YUV) {
+    return NULL;
+  }
+
+  if (last_y != NULL) *last_y = idec->params_.last_y;
+  if (u != NULL) *u = src->u.YUVA.u;
+  if (v != NULL) *v = src->u.YUVA.v;
+  if (a != NULL) *a = src->u.YUVA.a;
+  if (width != NULL) *width = src->width;
+  if (height != NULL) *height = src->height;
+  if (stride != NULL) *stride = src->u.YUVA.y_stride;
+  if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
+  if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;
+
+  return src->u.YUVA.y;
+}
+
+int WebPISetIOHooks(WebPIDecoder* const idec,
+                    VP8IoPutHook put,
+                    VP8IoSetupHook setup,
+                    VP8IoTeardownHook teardown,
+                    void* user_data) {
+  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
+    return 0;
+  }
+
+  idec->io_.put = put;
+  idec->io_.setup = setup;
+  idec->io_.teardown = teardown;
+  idec->io_.opaque = user_data;
+
+  return 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/io.c
+++ b/3rdparty/libwebp/dec/io.c
@@ -0,0 +1,633 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../dec/vp8i.h"
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Main YUV<->RGB conversion functions
+
+static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  const int uv_w = (mb_w + 1) / 2;
+  const int uv_h = (mb_h + 1) / 2;
+  int j;
+  for (j = 0; j < mb_h; ++j) {
+    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
+  }
+  for (j = 0; j < uv_h; ++j) {
+    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
+    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
+  }
+  return io->mb_h;
+}
+
+// Point-sampling U/V sampler.
+static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h - 1;
+  int j;
+  for (j = 0; j < last; j += 2) {
+    sample(y_src, y_src + io->y_stride, u_src, v_src,
+           dst, dst + buf->stride, mb_w);
+    y_src += 2 * io->y_stride;
+    u_src += io->uv_stride;
+    v_src += io->uv_stride;
+    dst += 2 * buf->stride;
+  }
+  if (j == last) {  // Just do the last line twice
+    sample(y_src, y_src, u_src, v_src, dst, dst, mb_w);
+  }
+  return io->mb_h;
+}
+
+//------------------------------------------------------------------------------
+// YUV444 -> RGB conversion
+
+#if 0   // TODO(skal): this is for future rescaling.
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* output = p->output;
+  const WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  const uint8_t* y_src = io->y;
+  const uint8_t* u_src = io->u;
+  const uint8_t* v_src = io->v;
+  const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace];
+  const int mb_w = io->mb_w;
+  const int last = io->mb_h;
+  int j;
+  for (j = 0; j < last; ++j) {
+    convert(y_src, u_src, v_src, dst, mb_w);
+    y_src += io->y_stride;
+    u_src += io->uv_stride;
+    v_src += io->uv_stride;
+    dst += buf->stride;
+  }
+  return io->mb_h;
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Fancy upsampling
+
+#ifdef FANCY_UPSAMPLING
+static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
+  int num_lines_out = io->mb_h;   // a priori guess
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
+  const uint8_t* cur_y = io->y;
+  const uint8_t* cur_u = io->u;
+  const uint8_t* cur_v = io->v;
+  const uint8_t* top_u = p->tmp_u;
+  const uint8_t* top_v = p->tmp_v;
+  int y = io->mb_y;
+  const int y_end = io->mb_y + io->mb_h;
+  const int mb_w = io->mb_w;
+  const int uv_w = (mb_w + 1) / 2;
+
+  if (y == 0) {
+    // First line is special cased. We mirror the u/v samples at boundary.
+    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+  } else {
+    // We can finish the left-over line from previous call.
+    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+    ++num_lines_out;
+  }
+  // Loop over each output pairs of row.
+  for (; y + 2 < y_end; y += 2) {
+    top_u = cur_u;
+    top_v = cur_v;
+    cur_u += io->uv_stride;
+    cur_v += io->uv_stride;
+    dst += 2 * buf->stride;
+    cur_y += 2 * io->y_stride;
+    upsample(cur_y - io->y_stride, cur_y,
+             top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+  }
+  // move to last row
+  cur_y += io->y_stride;
+  if (io->crop_top + y_end < io->crop_bottom) {
+    // Save the unfinished samples for next call (as we're not done yet).
+    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
+    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
+    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
+    // The fancy upsampler leaves a row unfinished behind
+    // (except for the very last row)
+    num_lines_out--;
+  } else {
+    // Process the very last row of even-sized picture
+    if (!(y_end & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+               dst + buf->stride, NULL, mb_w);
+    }
+  }
+  return num_lines_out;
+}
+
+#endif    /* FANCY_UPSAMPLING */
+
+//------------------------------------------------------------------------------
+
+static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  int j;
+
+  if (alpha != NULL) {
+    for (j = 0; j < mb_h; ++j) {
+      memcpy(dst, alpha, mb_w * sizeof(*dst));
+      alpha += io->width;
+      dst += buf->a_stride;
+    }
+  } else if (buf->a != NULL) {
+    // the user requested alpha, but there is none, set it to opaque.
+    for (j = 0; j < mb_h; ++j) {
+      memset(dst, 0xff, mb_w * sizeof(*dst));
+      dst += buf->a_stride;
+    }
+  }
+  return 0;
+}
+
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+    uint32_t alpha_mask = 0xff;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        const uint32_t alpha_value = alpha[i];
+        dst[4 * i] = alpha_value;
+        alpha_mask &= alpha_value;
+      }
+      alpha += io->width;
+      dst += buf->stride;
+    }
+    // alpha_mask is < 0xff if there's non-trivial alpha to premultiply with.
+    if (alpha_mask != 0xff && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                             mb_w, num_rows, buf->stride);
+    }
+  }
+  return 0;
+}
+
+static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* alpha_dst = base_rgba + 1;
+    uint32_t alpha_mask = 0x0f;
+    int i, j;
+
+    for (j = 0; j < num_rows; ++j) {
+      for (i = 0; i < mb_w; ++i) {
+        // Fill in the alpha value (converted to 4 bits).
+        const uint32_t alpha_value = alpha[i] >> 4;
+        alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+        alpha_mask &= alpha_value;
+      }
+      alpha += io->width;
+      alpha_dst += buf->stride;
+    }
+    if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
+    }
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// YUV rescaling (no final RGB conversion needed)
+
+static int Rescale(const uint8_t* src, int src_stride,
+                   int new_lines, WebPRescaler* const wrk) {
+  int num_lines_out = 0;
+  while (new_lines > 0) {    // import new contributions of source rows.
+    const int lines_in = WebPRescalerImport(wrk, new_lines, src, src_stride);
+    src += lines_in * src_stride;
+    new_lines -= lines_in;
+    num_lines_out += WebPRescalerExport(wrk);    // emit output row(s)
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;
+  const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y);
+  Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
+  Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a != NULL) {
+    Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
+  }
+  return 0;
+}
+
+static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
+  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_out_width  = (out_width + 1) >> 1;
+  const int uv_out_height = (out_height + 1) >> 1;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for luma rescaler
+  const size_t uv_work_size = 2 * uv_out_width;  // and for each u/v ones
+  size_t tmp_size;
+  int32_t* work;
+
+  tmp_size = work_size + 2 * uv_work_size;
+  if (has_alpha) {
+    tmp_size += work_size;
+  }
+  p->memory = calloc(1, tmp_size * sizeof(*work));
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   buf->y, out_width, out_height, buf->y_stride, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
+                   uv_in_width, uv_out_width,
+                   uv_in_height, uv_out_height,
+                   work + work_size + uv_work_size);
+  p->emit = EmitRescaledYUV;
+
+  if (has_alpha) {
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     buf->a, out_width, out_height, buf->a_stride, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + work_size + 2 * uv_work_size);
+    p->emit_alpha = EmitRescaledAlphaYUV;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// RGBA rescaling
+
+static int ExportRGB(WebPDecParams* const p, int y_pos) {
+  const WebPYUV444Converter convert =
+      WebPYUV444Converters[p->output->colorspace];
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  // For RGB rescaling, because of the YUV420, current scan position
+  // U/V can be +1/-1 line from the Y one.  Hence the double test.
+  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
+         WebPRescalerHasPendingOutput(&p->scaler_u)) {
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
+    WebPRescalerExportRow(&p->scaler_y);
+    WebPRescalerExportRow(&p->scaler_u);
+    WebPRescalerExportRow(&p->scaler_v);
+    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
+            dst, p->scaler_y.dst_width);
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;
+  int j = 0, uv_j = 0;
+  int num_lines_out = 0;
+  while (j < mb_h) {
+    const int y_lines_in =
+        WebPRescalerImport(&p->scaler_y, mb_h - j,
+                           io->y + j * io->y_stride, io->y_stride);
+    const int u_lines_in =
+        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                           io->u + uv_j * io->uv_stride, io->uv_stride);
+    const int v_lines_in =
+        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                           io->v + uv_j * io->uv_stride, io->uv_stride);
+    (void)v_lines_in;   // remove a gcc warning
+    assert(u_lines_in == v_lines_in);
+    j += y_lines_in;
+    uv_j += u_lines_in;
+    num_lines_out += ExportRGB(p, num_lines_out);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int alpha_first =
+      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+  int num_lines_out = 0;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
+  const int width = p->scaler_a.dst_width;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0xff) {
+    WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                           width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  uint8_t* alpha_dst = base_rgba + 1;
+  int num_lines_out = 0;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int width = p->scaler_a.dst_width;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0x0f;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a);
+    for (i = 0; i < width; ++i) {
+      // Fill in the alpha value (converted to 4 bits).
+      const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+      alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha_dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0x0f) {
+    WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a != NULL) {
+    WebPRescaler* const scaler = &p->scaler_a;
+    int j = 0;
+    int pos = 0;
+    while (j < io->mb_h) {
+      j += WebPRescalerImport(scaler, io->mb_h - j,
+                              io->a + j * io->width, io->width);
+      pos += p->emit_alpha_row(p, pos);
+    }
+  }
+  return 0;
+}
+
+static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  int32_t* work;  // rescalers work area
+  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
+  size_t tmp_size1, tmp_size2;
+
+  tmp_size1 = 3 * work_size;
+  tmp_size2 = 3 * out_width;
+  if (has_alpha) {
+    tmp_size1 += work_size;
+    tmp_size2 += out_width;
+  }
+  p->memory = calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp));
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  tmp = (uint8_t*)(work + tmp_size1);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   tmp + 0 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work + 0 * work_size);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   tmp + 1 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 1 * work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   tmp + 2 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 2 * work_size);
+  p->emit = EmitRescaledRGB;
+
+  if (has_alpha) {
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     tmp + 3 * out_width, out_width, out_height, 0, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + 3 * work_size);
+    p->emit_alpha = EmitRescaledAlphaRGB;
+    if (p->output->colorspace == MODE_RGBA_4444 ||
+        p->output->colorspace == MODE_rgbA_4444) {
+      p->emit_alpha_row = ExportAlphaRGBA4444;
+    } else {
+      p->emit_alpha_row = ExportAlpha;
+    }
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Default custom functions
+
+static int CustomSetup(VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int is_rgb = WebPIsRGBMode(colorspace);
+  const int is_alpha = WebPIsAlphaMode(colorspace);
+
+  p->memory = NULL;
+  p->emit = NULL;
+  p->emit_alpha = NULL;
+  p->emit_alpha_row = NULL;
+  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
+    return 0;
+  }
+
+  if (io->use_scaling) {
+    const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
+    if (!ok) {
+      return 0;    // memory error
+    }
+  } else {
+    if (is_rgb) {
+      p->emit = EmitSampledRGB;   // default
+#ifdef FANCY_UPSAMPLING
+      if (io->fancy_upsampling) {
+        const int uv_width = (io->mb_w + 1) >> 1;
+        p->memory = malloc(io->mb_w + 2 * uv_width);
+        if (p->memory == NULL) {
+          return 0;   // memory error.
+        }
+        p->tmp_y = (uint8_t*)p->memory;
+        p->tmp_u = p->tmp_y + io->mb_w;
+        p->tmp_v = p->tmp_u + uv_width;
+        p->emit = EmitFancyRGB;
+        WebPInitUpsamplers();
+      }
+#endif
+    } else {
+      p->emit = EmitYUV;
+    }
+    if (is_alpha) {  // need transparency output
+      if (WebPIsPremultipliedMode(colorspace)) WebPInitPremultiply();
+      p->emit_alpha =
+          (colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
+              EmitAlphaRGBA4444
+          : is_rgb ? EmitAlphaRGB
+          : EmitAlphaYUV;
+    }
+  }
+
+  if (is_rgb) {
+    VP8YUVInit();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int CustomPut(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int num_lines_out;
+  assert(!(io->mb_y & 1));
+
+  if (mb_w <= 0 || mb_h <= 0) {
+    return 0;
+  }
+  num_lines_out = p->emit(io, p);
+  if (p->emit_alpha) {
+    p->emit_alpha(io, p);
+  }
+  p->last_y += num_lines_out;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void CustomTeardown(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  free(p->memory);
+  p->memory = NULL;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
+  io->put      = CustomPut;
+  io->setup    = CustomSetup;
+  io->teardown = CustomTeardown;
+  io->opaque   = params;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/layer.c
+++ b/3rdparty/libwebp/dec/layer.c
@@ -0,0 +1,35 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Enhancement layer (for YUV444/422)
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+
+int VP8DecodeLayer(VP8Decoder* const dec) {
+  assert(dec);
+  assert(dec->layer_data_size_ > 0);
+  (void)dec;
+
+  // TODO: handle enhancement layer here.
+
+  return 1;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/quant.c
+++ b/3rdparty/libwebp/dec/quant.c
@@ -0,0 +1,113 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Quantizer initialization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+static WEBP_INLINE int clip(int v, int M) {
+  return v < 0 ? 0 : v > M ? M : v;
+}
+
+// Paragraph 14.1
+static const uint8_t kDcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  10,
+  11,   12,  13,  14,  15,  16,  17,  17,
+  18,   19,  20,  20,  21,  21,  22,  22,
+  23,   23,  24,  25,  25,  26,  27,  28,
+  29,   30,  31,  32,  33,  34,  35,  36,
+  37,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  46,  47,  48,  49,  50,
+  51,   52,  53,  54,  55,  56,  57,  58,
+  59,   60,  61,  62,  63,  64,  65,  66,
+  67,   68,  69,  70,  71,  72,  73,  74,
+  75,   76,  76,  77,  78,  79,  80,  81,
+  82,   83,  84,  85,  86,  87,  88,  89,
+  91,   93,  95,  96,  98, 100, 101, 102,
+  104, 106, 108, 110, 112, 114, 116, 118,
+  122, 124, 126, 128, 130, 132, 134, 136,
+  138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  11,
+  12,   13,  14,  15,  16,  17,  18,  19,
+  20,   21,  22,  23,  24,  25,  26,  27,
+  28,   29,  30,  31,  32,  33,  34,  35,
+  36,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  47,  48,  49,  50,  51,
+  52,   53,  54,  55,  56,  57,  58,  60,
+  62,   64,  66,  68,  70,  72,  74,  76,
+  78,   80,  82,  84,  86,  88,  90,  92,
+  94,   96,  98, 100, 102, 104, 106, 108,
+  110, 112, 114, 116, 119, 122, 125, 128,
+  131, 134, 137, 140, 143, 146, 149, 152,
+  155, 158, 161, 164, 167, 170, 173, 177,
+  181, 185, 189, 193, 197, 201, 205, 209,
+  213, 217, 221, 225, 229, 234, 239, 245,
+  249, 254, 259, 264, 269, 274, 279, 284
+};
+
+//------------------------------------------------------------------------------
+// Paragraph 9.6
+
+void VP8ParseQuant(VP8Decoder* const dec) {
+  VP8BitReader* const br = &dec->br_;
+  const int base_q0 = VP8GetValue(br, 7);
+  const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+
+  const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
+  int i;
+
+  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+    int q;
+    if (hdr->use_segment_) {
+      q = hdr->quantizer_[i];
+      if (!hdr->absolute_delta_) {
+        q += base_q0;
+      }
+    } else {
+      if (i > 0) {
+        dec->dqm_[i] = dec->dqm_[0];
+        continue;
+      } else {
+        q = base_q0;
+      }
+    }
+    {
+      VP8QuantMatrix* const m = &dec->dqm_[i];
+      m->y1_mat_[0] = kDcTable[clip(q + dqy1_dc, 127)];
+      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];
+
+      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
+      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;
+
+      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
+      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/tree.c
+++ b/3rdparty/libwebp/dec/tree.c
@@ -0,0 +1,589 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Coding trees and probas
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "vp8i.h"
+
+#define USE_GENERIC_TREE
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#ifdef USE_GENERIC_TREE
+static const int8_t kYModesIntra4[18] = {
+  -B_DC_PRED, 1,
+    -B_TM_PRED, 2,
+      -B_VE_PRED, 3,
+        4, 6,
+          -B_HE_PRED, 5,
+            -B_RD_PRED, -B_VR_PRED,
+        -B_LD_PRED, 7,
+          -B_VL_PRED, 8,
+            -B_HD_PRED, -B_HU_PRED
+};
+#endif
+
+#ifndef ONLY_KEYFRAME_CODE
+
+// inter prediction modes
+enum {
+  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
+  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
+
+static const int8_t kYModesInter[8] = {
+  -DC_PRED, 1,
+    2, 3,
+      -V_PRED, -H_PRED,
+      -TM_PRED, -B_PRED
+};
+
+static const int8_t kMBSplit[6] = {
+  -3, 1,
+    -2, 2,
+      -0, -1
+};
+
+static const int8_t kMVRef[8] = {
+  -ZEROMV, 1,
+    -NEARESTMV, 2,
+      -NEARMV, 3,
+        -NEWMV, -SPLITMV
+};
+
+static const int8_t kMVRef4[6] = {
+  -LEFT4, 1,
+    -ABOVE4, 2,
+      -ZERO4, -NEW4
+};
+#endif
+
+//------------------------------------------------------------------------------
+// Default probabilities
+
+// Inter
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
+static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
+static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
+  { 162, 128, 225, 146, 172, 147, 214,  39,
+    156, 128, 129, 132,  75, 145, 178, 206,
+    239, 254, 254 },
+  { 164, 128, 204, 170, 119, 235, 140, 230,
+    228, 128, 130, 130,  74, 148, 180, 203,
+    236, 254, 254 }
+};
+#endif
+
+// Paragraph 13.5
+static const uint8_t
+  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  // genereated using vp8_default_coef_probs() in entropy.c:129
+  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+      { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+      { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+    },
+    { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+    },
+    { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+    },
+    { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+    },
+    { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+    },
+    { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+    },
+    { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+    },
+    { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+    },
+    { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+    },
+    { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+    },
+    { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+    },
+    { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+    },
+    { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }
+};
+
+// Paragraph 11.5
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+  { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+    { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+    { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+    { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+    { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+    { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+    { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+    { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+    { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+  { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+    { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+    { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+    { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+    { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+    { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+    { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+    { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+    { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+  { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+    { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+    { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+    { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+    { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+    { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+    { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+    { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+    { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+  { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+    { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+    { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+    { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+    { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+    { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+    { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+    { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+    { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+  { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+    { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+    { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+    { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+    { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+    { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+    { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+    { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+    { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+  { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+    { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+    { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+    { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+    { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+    { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+    { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+    { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+    { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+  { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+    { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+    { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+    { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+    { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+    { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+    { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+    { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+    { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+  { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+    { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+    { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+    { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+    { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+    { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+    { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+    { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+    { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+  { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+    { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+    { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+    { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+    { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+    { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+    { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+    { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+    { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+void VP8ResetProba(VP8Proba* const proba) {
+  memset(proba->segments_, 255u, sizeof(proba->segments_));
+  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
+#ifndef ONLY_KEYFRAME_CODE
+  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
+  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
+  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
+#endif
+}
+
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
+  uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
+  uint8_t* const left = dec->intra_l_;
+  // Hardcoded 16x16 intra-mode decision tree.
+  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!dec->is_i4x4_) {
+    const int ymode =
+        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
+                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+    dec->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(top[0]));
+    memset(left, ymode, 4 * sizeof(left[0]));
+  } else {
+    uint8_t* modes = dec->imodes_;
+    int y;
+    for (y = 0; y < 4; ++y) {
+      int ymode = left[y];
+      int x;
+      for (x = 0; x < 4; ++x) {
+        const uint8_t* const prob = kBModesProba[top[x]][ymode];
+#ifdef USE_GENERIC_TREE
+        // Generic tree-parsing
+        int i = 0;
+        do {
+          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+        } while (i > 0);
+        ymode = -i;
+#else
+        // Hardcoded tree parsing
+        ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
+                  !VP8GetBit(br, prob[1]) ? B_TM_PRED :
+                    !VP8GetBit(br, prob[2]) ? B_VE_PRED :
+                      !VP8GetBit(br, prob[3]) ?
+                        (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
+                          (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
+                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
+                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
+                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+#endif    // USE_GENERIC_TREE
+        top[x] = ymode;
+        *modes++ = ymode;
+      }
+      left[y] = ymode;
+    }
+  }
+  // Hardcoded UVMode decision tree
+  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+               : !VP8GetBit(br, 114) ? V_PRED
+               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 13
+
+static const uint8_t
+    CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+      { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  }
+};
+
+#ifndef ONLY_KEYFRAME_CODE
+static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
+  { 237, 246, 253, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 250, 250,
+    252, 254, 254 },
+  { 231, 243, 245, 253, 254, 254, 254, 254,
+    254, 254, 254, 254, 254, 254, 251, 251,
+    254, 254, 254 }
+};
+#endif
+
+// Paragraph 9.9
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
+  VP8Proba* const proba = &dec->proba_;
+  int t, b, c, p;
+  for (t = 0; t < NUM_TYPES; ++t) {
+    for (b = 0; b < NUM_BANDS; ++b) {
+      for (c = 0; c < NUM_CTX; ++c) {
+        for (p = 0; p < NUM_PROBAS; ++p) {
+          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
+            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
+          }
+        }
+      }
+    }
+  }
+  dec->use_skip_proba_ = VP8Get(br);
+  if (dec->use_skip_proba_) {
+    dec->skip_p_ = VP8GetValue(br, 8);
+  }
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->frm_hdr_.key_frame_) {
+    int i;
+    dec->intra_p_ = VP8GetValue(br, 8);
+    dec->last_p_ = VP8GetValue(br, 8);
+    dec->golden_p_ = VP8GetValue(br, 8);
+    if (VP8Get(br)) {   // update y-mode
+      for (i = 0; i < 4; ++i) {
+        proba->ymode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    if (VP8Get(br)) {   // update uv-mode
+      for (i = 0; i < 3; ++i) {
+        proba->uvmode_[i] = VP8GetValue(br, 8);
+      }
+    }
+    // update MV
+    for (i = 0; i < 2; ++i) {
+      int k;
+      for (k = 0; k < NUM_MV_PROBAS; ++k) {
+        if (VP8GetBit(br, MVUpdateProba[i][k])) {
+          const int v = VP8GetValue(br, 7);
+          proba->mv_[i][k] = v ? v << 1 : 1;
+        }
+      }
+    }
+  }
+#endif
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/vp8.c
+++ b/3rdparty/libwebp/dec/vp8.c
@@ -0,0 +1,782 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// main entry for the decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+
+int WebPGetDecoderVersion(void) {
+  return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// VP8Decoder
+
+static void SetOk(VP8Decoder* const dec) {
+  dec->status_ = VP8_STATUS_OK;
+  dec->error_msg_ = "OK";
+}
+
+int VP8InitIoInternal(VP8Io* const io, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // mismatch error
+  }
+  if (io != NULL) {
+    memset(io, 0, sizeof(*io));
+  }
+  return 1;
+}
+
+VP8Decoder* VP8New(void) {
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1, sizeof(*dec));
+  if (dec != NULL) {
+    SetOk(dec);
+    WebPWorkerInit(&dec->worker_);
+    dec->ready_ = 0;
+    dec->num_parts_ = 1;
+  }
+  return dec;
+}
+
+VP8StatusCode VP8Status(VP8Decoder* const dec) {
+  if (!dec) return VP8_STATUS_INVALID_PARAM;
+  return dec->status_;
+}
+
+const char* VP8StatusMessage(VP8Decoder* const dec) {
+  if (dec == NULL) return "no object";
+  if (!dec->error_msg_) return "OK";
+  return dec->error_msg_;
+}
+
+void VP8Delete(VP8Decoder* const dec) {
+  if (dec != NULL) {
+    VP8Clear(dec);
+    free(dec);
+  }
+}
+
+int VP8SetError(VP8Decoder* const dec,
+                VP8StatusCode error, const char* const msg) {
+  // TODO This check would be unnecessary if alpha decompression was separated
+  // from VP8ProcessRow/FinishRow. This avoids setting 'dec->status_' to
+  // something other than VP8_STATUS_BITSTREAM_ERROR on alpha decompression
+  // failure.
+  if (dec->status_ == VP8_STATUS_OK) {
+    dec->status_ = error;
+    dec->error_msg_ = msg;
+    dec->ready_ = 0;
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+
+int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
+  return (data_size >= 3 &&
+          data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
+}
+
+int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
+               int* const width, int* const height) {
+  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
+    return 0;         // not enough data
+  }
+  // check signature
+  if (!VP8CheckSignature(data + 3, data_size - 3)) {
+    return 0;         // Wrong signature.
+  } else {
+    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
+    const int key_frame = !(bits & 1);
+    const int w = ((data[7] << 8) | data[6]) & 0x3fff;
+    const int h = ((data[9] << 8) | data[8]) & 0x3fff;
+
+    if (!key_frame) {   // Not a keyframe.
+      return 0;
+    }
+
+    if (((bits >> 1) & 7) > 3) {
+      return 0;         // unknown profile
+    }
+    if (!((bits >> 4) & 1)) {
+      return 0;         // first frame is invisible!
+    }
+    if (((bits >> 5)) >= chunk_size) {  // partition_length
+      return 0;         // inconsistent size information.
+    }
+
+    if (width) {
+      *width = w;
+    }
+    if (height) {
+      *height = h;
+    }
+
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Header parsing
+
+static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
+  assert(hdr != NULL);
+  hdr->use_segment_ = 0;
+  hdr->update_map_ = 0;
+  hdr->absolute_delta_ = 1;
+  memset(hdr->quantizer_, 0, sizeof(hdr->quantizer_));
+  memset(hdr->filter_strength_, 0, sizeof(hdr->filter_strength_));
+}
+
+// Paragraph 9.3
+static int ParseSegmentHeader(VP8BitReader* br,
+                              VP8SegmentHeader* hdr, VP8Proba* proba) {
+  assert(br != NULL);
+  assert(hdr != NULL);
+  hdr->use_segment_ = VP8Get(br);
+  if (hdr->use_segment_) {
+    hdr->update_map_ = VP8Get(br);
+    if (VP8Get(br)) {   // update data
+      int s;
+      hdr->absolute_delta_ = VP8Get(br);
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+      }
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+      }
+    }
+    if (hdr->update_map_) {
+      int s;
+      for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
+        proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+      }
+    }
+  } else {
+    hdr->update_map_ = 0;
+  }
+  return !br->eof_;
+}
+
+// Paragraph 9.5
+// This function returns VP8_STATUS_SUSPENDED if we don't have all the
+// necessary data in 'buf'.
+// This case is not necessarily an error (for incremental decoding).
+// Still, no bitreader is ever initialized to make it possible to read
+// unavailable memory.
+// If we don't even have the partitions' sizes, than VP8_STATUS_NOT_ENOUGH_DATA
+// is returned, and this is an unrecoverable error.
+// If the partitions were positioned ok, VP8_STATUS_OK is returned.
+static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
+                                     const uint8_t* buf, size_t size) {
+  VP8BitReader* const br = &dec->br_;
+  const uint8_t* sz = buf;
+  const uint8_t* buf_end = buf + size;
+  const uint8_t* part_start;
+  int last_part;
+  int p;
+
+  dec->num_parts_ = 1 << VP8GetValue(br, 2);
+  last_part = dec->num_parts_ - 1;
+  part_start = buf + last_part * 3;
+  if (buf_end < part_start) {
+    // we can't even read the sizes with sz[]! That's a failure.
+    return VP8_STATUS_NOT_ENOUGH_DATA;
+  }
+  for (p = 0; p < last_part; ++p) {
+    const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
+    const uint8_t* part_end = part_start + psize;
+    if (part_end > buf_end) part_end = buf_end;
+    VP8InitBitReader(dec->parts_ + p, part_start, part_end);
+    part_start = part_end;
+    sz += 3;
+  }
+  VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end);
+  return (part_start < buf_end) ? VP8_STATUS_OK :
+           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
+}
+
+// Paragraph 9.4
+static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
+  VP8FilterHeader* const hdr = &dec->filter_hdr_;
+  hdr->simple_    = VP8Get(br);
+  hdr->level_     = VP8GetValue(br, 6);
+  hdr->sharpness_ = VP8GetValue(br, 3);
+  hdr->use_lf_delta_ = VP8Get(br);
+  if (hdr->use_lf_delta_) {
+    if (VP8Get(br)) {   // update lf-delta?
+      int i;
+      for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+      for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+    }
+  }
+  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
+  return !br->eof_;
+}
+
+// Topmost call
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
+  const uint8_t* buf;
+  size_t buf_size;
+  VP8FrameHeader* frm_hdr;
+  VP8PictureHeader* pic_hdr;
+  VP8BitReader* br;
+  VP8StatusCode status;
+  WebPHeaderStructure headers;
+
+  if (dec == NULL) {
+    return 0;
+  }
+  SetOk(dec);
+  if (io == NULL) {
+    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
+                       "null VP8Io passed to VP8GetHeaders()");
+  }
+
+  // Process Pre-VP8 chunks.
+  headers.data = io->data;
+  headers.data_size = io->data_size;
+  status = WebPParseHeaders(&headers);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "Incorrect/incomplete header.");
+  }
+  if (headers.is_lossless) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "Unexpected lossless format encountered.");
+  }
+
+  if (dec->alpha_data_ == NULL) {
+    assert(dec->alpha_data_size_ == 0);
+    // We have NOT set alpha data yet. Set it now.
+    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
+    // WebPParseHeaders() is called more than once, as in incremental decoding
+    // case.)
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+  }
+
+  // Process the VP8 frame header.
+  buf = headers.data + headers.offset;
+  buf_size = headers.data_size - headers.offset;
+  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
+  if (buf_size < 4) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "Truncated header.");
+  }
+
+  // Paragraph 9.1
+  {
+    const uint32_t bits = buf[0] | (buf[1] << 8) | (buf[2] << 16);
+    frm_hdr = &dec->frm_hdr_;
+    frm_hdr->key_frame_ = !(bits & 1);
+    frm_hdr->profile_ = (bits >> 1) & 7;
+    frm_hdr->show_ = (bits >> 4) & 1;
+    frm_hdr->partition_length_ = (bits >> 5);
+    if (frm_hdr->profile_ > 3)
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "Incorrect keyframe parameters.");
+    if (!frm_hdr->show_)
+      return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
+                         "Frame not displayable.");
+    buf += 3;
+    buf_size -= 3;
+  }
+
+  pic_hdr = &dec->pic_hdr_;
+  if (frm_hdr->key_frame_) {
+    // Paragraph 9.2
+    if (buf_size < 7) {
+      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                         "cannot parse picture header");
+    }
+    if (!VP8CheckSignature(buf, buf_size)) {
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "Bad code word");
+    }
+    pic_hdr->width_ = ((buf[4] << 8) | buf[3]) & 0x3fff;
+    pic_hdr->xscale_ = buf[4] >> 6;   // ratio: 1, 5/4 5/3 or 2
+    pic_hdr->height_ = ((buf[6] << 8) | buf[5]) & 0x3fff;
+    pic_hdr->yscale_ = buf[6] >> 6;
+    buf += 7;
+    buf_size -= 7;
+
+    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
+    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    // Setup default output area (can be later modified during io->setup())
+    io->width = pic_hdr->width_;
+    io->height = pic_hdr->height_;
+    io->use_scaling  = 0;
+    io->use_cropping = 0;
+    io->crop_top  = 0;
+    io->crop_left = 0;
+    io->crop_right  = io->width;
+    io->crop_bottom = io->height;
+    io->mb_w = io->width;   // sanity check
+    io->mb_h = io->height;  // ditto
+
+    VP8ResetProba(&dec->proba_);
+    ResetSegmentHeader(&dec->segment_hdr_);
+    dec->segment_ = 0;    // default for intra
+  }
+
+  // Check if we have all the partition #0 available, and initialize dec->br_
+  // to read this partition (and this partition only).
+  if (frm_hdr->partition_length_ > buf_size) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "bad partition length");
+  }
+
+  br = &dec->br_;
+  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
+  buf += frm_hdr->partition_length_;
+  buf_size -= frm_hdr->partition_length_;
+
+  if (frm_hdr->key_frame_) {
+    pic_hdr->colorspace_ = VP8Get(br);
+    pic_hdr->clamp_type_ = VP8Get(br);
+  }
+  if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "cannot parse segment header");
+  }
+  // Filter specs
+  if (!ParseFilterHeader(br, dec)) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "cannot parse filter header");
+  }
+  status = ParsePartitions(dec, buf, buf_size);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "cannot parse partitions");
+  }
+
+  // quantizer change
+  VP8ParseQuant(dec);
+
+  // Frame buffer marking
+  if (!frm_hdr->key_frame_) {
+    // Paragraph 9.7
+#ifndef ONLY_KEYFRAME_CODE
+    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
+    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
+    if (!(dec->buffer_flags_ & 1)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
+    }
+    if (!(dec->buffer_flags_ & 2)) {
+      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
+    }
+    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
+    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
+#else
+    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
+                       "Not a key frame.");
+#endif
+  } else {
+    dec->buffer_flags_ = 0x003 | 0x100;
+  }
+
+  // Paragraph 9.8
+#ifndef ONLY_KEYFRAME_CODE
+  dec->update_proba_ = VP8Get(br);
+  if (!dec->update_proba_) {    // save for later restore
+    dec->proba_saved_ = dec->proba_;
+  }
+  dec->buffer_flags_ &= 1 << 8;
+  dec->buffer_flags_ |=
+      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
+#else
+  VP8Get(br);   // just ignore the value of update_proba_
+#endif
+
+  VP8ParseProba(br, dec);
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  // Extensions
+  if (dec->pic_hdr_.colorspace_) {
+    const size_t kTrailerSize = 8;
+    const uint8_t kTrailerMarker = 0x01;
+    const uint8_t* ext_buf = buf - kTrailerSize;
+    size_t size;
+
+    if (frm_hdr->partition_length_ < kTrailerSize ||
+        ext_buf[kTrailerSize - 1] != kTrailerMarker) {
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "RIFF: Inconsistent extra information.");
+    }
+
+    // Layer
+    size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16);
+    dec->layer_data_size_ = size;
+    dec->layer_data_ = NULL;  // will be set later
+    dec->layer_colorspace_ = ext_buf[3];
+  }
+#endif
+
+  // sanitized state
+  dec->ready_ = 1;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Residual decoding (Paragraph 13.2 / 13.3)
+
+static const int kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // extra entry as sentinel
+};
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};
+
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
+typedef const uint8_t (*ProbaCtxArray)[NUM_PROBAS];
+
+// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
+  int v;
+  if (!VP8GetBit(br, p[3])) {
+    if (!VP8GetBit(br, p[4])) {
+      v = 2;
+    } else {
+      v = 3 + VP8GetBit(br, p[5]);
+    }
+  } else {
+    if (!VP8GetBit(br, p[6])) {
+      if (!VP8GetBit(br, p[7])) {
+        v = 5 + VP8GetBit(br, 159);
+      } else {
+        v = 7 + 2 * VP8GetBit(br, 165);
+        v += VP8GetBit(br, 145);
+      }
+    } else {
+      const uint8_t* tab;
+      const int bit1 = VP8GetBit(br, p[8]);
+      const int bit0 = VP8GetBit(br, p[9 + bit1]);
+      const int cat = 2 * bit1 + bit0;
+      v = 0;
+      for (tab = kCat3456[cat]; *tab; ++tab) {
+        v += v + VP8GetBit(br, *tab);
+      }
+      v += 3 + (8 << cat);
+    }
+  }
+  return v;
+}
+
+// Returns the position of the last non-zero coeff plus one
+// (and 0 if there's no coeff at all)
+static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
+  const uint8_t* p = prob[n][ctx];
+  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
+    return 0;
+  }
+  for (; n < 16; ++n) {
+    const ProbaCtxArray p_ctx = prob[kBands[n + 1]];
+    if (!VP8GetBit(br, p[1])) {
+      p = p_ctx[0];
+    } else {  // non zero coeff
+      int v;
+      if (!VP8GetBit(br, p[2])) {
+        v = 1;
+        p = p_ctx[1];
+      } else {
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
+      }
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+      if (n < 15 && !VP8GetBit(br, p[0])) {   // EOB
+        return n + 1;
+      }
+    }
+  }
+  return 16;
+}
+
+// Alias-safe way of converting 4bytes to 32bits.
+typedef union {
+  uint8_t  i8[4];
+  uint32_t i32;
+} PackedNz;
+
+// Table to unpack four bits into four bytes
+static const PackedNz kUnpackTab[16] = {
+  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
+  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
+  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
+  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
+
+// Macro to pack four LSB of four bytes into four bits.
+#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
+    defined(__BIG_ENDIAN__)
+#define PACK_CST 0x08040201U
+#else
+#define PACK_CST 0x01020408U
+#endif
+#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
+
+static void ParseResiduals(VP8Decoder* const dec,
+                           VP8MB* const mb, VP8BitReader* const token_br) {
+  int out_t_nz, out_l_nz, first;
+  ProbaArray ac_prob;
+  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
+  int16_t* dst = dec->coeffs_;
+  VP8MB* const left_mb = dec->mb_info_ - 1;
+  PackedNz nz_ac, nz_dc;
+  PackedNz tnz, lnz;
+  uint32_t non_zero_ac = 0;
+  uint32_t non_zero_dc = 0;
+  int x, y, ch;
+
+  nz_dc.i32 = nz_ac.i32 = 0;
+  memset(dst, 0, 384 * sizeof(*dst));
+  if (!dec->is_i4x4_) {    // parse DC
+    int16_t dc[16] = { 0 };
+    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
+    mb->dc_nz_ = left_mb->dc_nz_ =
+        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+                   ctx, q->y2_mat_, 0, dc) > 0);
+    first = 1;
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
+    VP8TransformWHT(dc, dst);
+  } else {
+    first = 0;
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+  }
+
+  tnz = kUnpackTab[mb->nz_ & 0xf];
+  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  for (y = 0; y < 4; ++y) {
+    int l = lnz.i8[y];
+    for (x = 0; x < 4; ++x) {
+      const int ctx = l + tnz.i8[x];
+      const int nz = GetCoeffs(token_br, ac_prob, ctx,
+                               q->y1_mat_, first, dst);
+      tnz.i8[x] = l = (nz > 0);
+      nz_dc.i8[x] = (dst[0] != 0);
+      nz_ac.i8[x] = (nz > 1);
+      dst += 16;
+    }
+    lnz.i8[y] = l;
+    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
+    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+  }
+  out_t_nz = PACK(tnz, 24);
+  out_l_nz = PACK(lnz, 24);
+
+  tnz = kUnpackTab[mb->nz_ >> 4];
+  lnz = kUnpackTab[left_mb->nz_ >> 4];
+  for (ch = 0; ch < 4; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      int l = lnz.i8[ch + y];
+      for (x = 0; x < 2; ++x) {
+        const int ctx = l + tnz.i8[ch + x];
+        const int nz =
+            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
+                      ctx, q->uv_mat_, 0, dst);
+        tnz.i8[ch + x] = l = (nz > 0);
+        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
+        nz_ac.i8[y * 2 + x] = (nz > 1);
+        dst += 16;
+      }
+      lnz.i8[ch + y] = l;
+    }
+    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
+    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+  }
+  out_t_nz |= PACK(tnz, 20);
+  out_l_nz |= PACK(lnz, 20);
+  mb->nz_ = out_t_nz;
+  left_mb->nz_ = out_l_nz;
+
+  dec->non_zero_ac_ = non_zero_ac;
+  dec->non_zero_ = non_zero_ac | non_zero_dc;
+  mb->skip_ = !dec->non_zero_;
+}
+#undef PACK
+
+//------------------------------------------------------------------------------
+// Main loop
+
+int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
+  VP8BitReader* const br = &dec->br_;
+  VP8MB* const left = dec->mb_info_ - 1;
+  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+
+  // Note: we don't save segment map (yet), as we don't expect
+  // to decode more than 1 keyframe.
+  if (dec->segment_hdr_.update_map_) {
+    // Hardcoded tree parsing
+    dec->segment_ = !VP8GetBit(br, dec->proba_.segments_[0]) ?
+        VP8GetBit(br, dec->proba_.segments_[1]) :
+        2 + VP8GetBit(br, dec->proba_.segments_[2]);
+  }
+  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+
+  VP8ParseIntraMode(br, dec);
+  if (br->eof_) {
+    return 0;
+  }
+
+  if (!info->skip_) {
+    ParseResiduals(dec, info, token_br);
+  } else {
+    left->nz_ = info->nz_ = 0;
+    if (!dec->is_i4x4_) {
+      left->dc_nz_ = info->dc_nz_ = 0;
+    }
+    dec->non_zero_ = 0;
+    dec->non_zero_ac_ = 0;
+  }
+
+  if (dec->filter_type_ > 0) {  // store filter info
+    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
+    *finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
+    finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+  }
+
+  return (!token_br->eof_);
+}
+
+void VP8InitScanline(VP8Decoder* const dec) {
+  VP8MB* const left = dec->mb_info_ - 1;
+  left->nz_ = 0;
+  left->dc_nz_ = 0;
+  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+  dec->filter_row_ =
+    (dec->filter_type_ > 0) &&
+    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+}
+
+static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    VP8BitReader* const token_br =
+        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+    VP8InitScanline(dec);
+    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+      if (!VP8DecodeMB(dec, token_br)) {
+        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                           "Premature end-of-file encountered.");
+      }
+      // Reconstruct and emit samples.
+      VP8ReconstructBlock(dec);
+    }
+    if (!VP8ProcessRow(dec, io)) {
+      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
+    }
+  }
+  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
+    return 0;
+  }
+
+  // Finish
+#ifndef ONLY_KEYFRAME_CODE
+  if (!dec->update_proba_) {
+    dec->proba_ = dec->proba_saved_;
+  }
+#endif
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (dec->layer_data_size_ > 0) {
+    if (!VP8DecodeLayer(dec)) {
+      return 0;
+    }
+  }
+#endif
+
+  return 1;
+}
+
+// Main entry point
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 0;
+  if (dec == NULL) {
+    return 0;
+  }
+  if (io == NULL) {
+    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
+                       "NULL VP8Io parameter in VP8Decode().");
+  }
+
+  if (!dec->ready_) {
+    if (!VP8GetHeaders(dec, io)) {
+      return 0;
+    }
+  }
+  assert(dec->ready_);
+
+  // Finish setting up the decoding parameter. Will call io->setup().
+  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
+  if (ok) {   // good to go.
+    // Will allocate memory and prepare everything.
+    if (ok) ok = VP8InitFrame(dec, io);
+
+    // Main decoding loop
+    if (ok) ok = ParseFrame(dec, io);
+
+    // Exit.
+    ok &= VP8ExitCritical(dec, io);
+  }
+
+  if (!ok) {
+    VP8Clear(dec);
+    return 0;
+  }
+
+  dec->ready_ = 0;
+  return ok;
+}
+
+void VP8Clear(VP8Decoder* const dec) {
+  if (dec == NULL) {
+    return;
+  }
+  if (dec->use_threads_) {
+    WebPWorkerEnd(&dec->worker_);
+  }
+  if (dec->mem_) {
+    free(dec->mem_);
+  }
+  dec->mem_ = NULL;
+  dec->mem_size_ = 0;
+  memset(&dec->br_, 0, sizeof(dec->br_));
+  dec->ready_ = 0;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/vp8i.h
+++ b/3rdparty/libwebp/dec/vp8i.h
@@ -0,0 +1,333 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// VP8 decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DEC_VP8I_H_
+#define WEBP_DEC_VP8I_H_
+
+#include <string.h>     // for memcpy()
+#include "./vp8li.h"
+#include "../utils/bit_reader.h"
+#include "../utils/thread.h"
+#include "../dsp/dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Various defines and enums
+
+// version numbers
+#define DEC_MAJ_VERSION 0
+#define DEC_MIN_VERSION 3
+#define DEC_REV_VERSION 0
+
+#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
+
+// intra prediction modes
+enum { B_DC_PRED = 0,   // 4x4 modes
+       B_TM_PRED,
+       B_VE_PRED,
+       B_HE_PRED,
+       B_RD_PRED,
+       B_VR_PRED,
+       B_LD_PRED,
+       B_VL_PRED,
+       B_HD_PRED,
+       B_HU_PRED,
+       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
+
+       // Luma16 or UV modes
+       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       B_PRED = NUM_BMODES,   // refined I4x4 mode
+
+       // special modes
+       B_DC_PRED_NOTOP = 4,
+       B_DC_PRED_NOLEFT = 5,
+       B_DC_PRED_NOTOPLEFT = 6,
+       NUM_B_DC_MODES = 7 };
+
+enum { MB_FEATURE_TREE_PROBS = 3,
+       NUM_MB_SEGMENTS = 4,
+       NUM_REF_LF_DELTAS = 4,
+       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
+       MAX_NUM_PARTITIONS = 8,
+       // Probabilities
+       NUM_TYPES = 4,
+       NUM_BANDS = 8,
+       NUM_CTX = 3,
+       NUM_PROBAS = 11,
+       NUM_MV_PROBAS = 19 };
+
+// YUV-cache parameters.
+// Constraints are: We need to store one 16x16 block of luma samples (y),
+// and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
+// in order to be SIMD-friendly. We also need to store the top, left and
+// top-left samples (from previously decoded blocks), along with four
+// extra top-right samples for luma (intra4x4 prediction only).
+// One possible layout is, using 32 * (17 + 9) bytes:
+//
+//   .+------   <- only 1 pixel high
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyy..
+//   .+--.+--   <- only 1 pixel high
+//   .|uu.|vv
+//   .|uu.|vv
+//
+// Every character is a 4x4 block, with legend:
+//  '.' = unused
+//  'y' = y-samples   'u' = u-samples     'v' = u-samples
+//  '|' = left sample,   '-' = top sample,    '+' = top-left sample
+//  't' = extra top-right sample for 4x4 modes
+// With this layout, BPS (=Bytes Per Scan-line) is one cacheline size.
+#define BPS       32    // this is the common stride used by yuv[]
+#define YUV_SIZE (BPS * 17 + BPS * 9)
+#define Y_SIZE   (BPS * 17)
+#define Y_OFF    (BPS * 1 + 8)
+#define U_OFF    (Y_OFF + BPS * 16 + BPS)
+#define V_OFF    (U_OFF + 16)
+
+//------------------------------------------------------------------------------
+// Headers
+
+typedef struct {
+  uint8_t key_frame_;
+  uint8_t profile_;
+  uint8_t show_;
+  uint32_t partition_length_;
+} VP8FrameHeader;
+
+typedef struct {
+  uint16_t width_;
+  uint16_t height_;
+  uint8_t xscale_;
+  uint8_t yscale_;
+  uint8_t colorspace_;   // 0 = YCbCr
+  uint8_t clamp_type_;
+} VP8PictureHeader;
+
+// segment features
+typedef struct {
+  int use_segment_;
+  int update_map_;        // whether to update the segment map or not
+  int absolute_delta_;    // absolute or delta values for quantizer and filter
+  int8_t quantizer_[NUM_MB_SEGMENTS];        // quantization changes
+  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
+} VP8SegmentHeader;
+
+// Struct collecting all frame-persistent probabilities.
+typedef struct {
+  uint8_t segments_[MB_FEATURE_TREE_PROBS];
+  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
+  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t ymode_[4], uvmode_[3];
+  uint8_t mv_[2][NUM_MV_PROBAS];
+#endif
+} VP8Proba;
+
+// Filter parameters
+typedef struct {
+  int simple_;                  // 0=complex, 1=simple
+  int level_;                   // [0..63]
+  int sharpness_;               // [0..7]
+  int use_lf_delta_;
+  int ref_lf_delta_[NUM_REF_LF_DELTAS];
+  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
+} VP8FilterHeader;
+
+//------------------------------------------------------------------------------
+// Informations about the macroblocks.
+
+typedef struct {  // filter specs
+  unsigned int f_level_:6;      // filter strength: 0..63
+  unsigned int f_ilevel_:6;     // inner limit: 1..63
+  unsigned int f_inner_:1;      // do inner filtering?
+} VP8FInfo;
+
+typedef struct {  // used for syntax-parsing
+  unsigned int nz_:24;       // non-zero AC/DC coeffs (24bit)
+  unsigned int dc_nz_:1;     // non-zero DC coeffs
+  unsigned int skip_:1;      // block type
+} VP8MB;
+
+// Dequantization matrices
+typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
+typedef struct {
+  quant_t y1_mat_, y2_mat_, uv_mat_;
+} VP8QuantMatrix;
+
+// Persistent information needed by the parallel processing
+typedef struct {
+  int id_;            // cache row to process (in [0..2])
+  int mb_y_;          // macroblock position of the row
+  int filter_row_;    // true if row-filtering is needed
+  VP8FInfo* f_info_;  // filter strengths
+  VP8Io io_;          // copy of the VP8Io to pass to put()
+} VP8ThreadContext;
+
+//------------------------------------------------------------------------------
+// VP8Decoder: the main opaque structure handed over to user
+
+struct VP8Decoder {
+  VP8StatusCode status_;
+  int ready_;     // true if ready to decode a picture with VP8Decode()
+  const char* error_msg_;  // set when status_ is not OK.
+
+  // Main data source
+  VP8BitReader br_;
+
+  // headers
+  VP8FrameHeader   frm_hdr_;
+  VP8PictureHeader pic_hdr_;
+  VP8FilterHeader  filter_hdr_;
+  VP8SegmentHeader segment_hdr_;
+
+  // Worker
+  WebPWorker worker_;
+  int use_threads_;    // use multi-thread
+  int cache_id_;       // current cache row
+  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
+  VP8ThreadContext thread_ctx_;  // Thread context
+
+  // dimension, in macroblock units.
+  int mb_w_, mb_h_;
+
+  // Macroblock to process/filter, depending on cropping and filter_type.
+  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
+  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
+
+  // number of partitions.
+  int num_parts_;
+  // per-partition boolean decoders.
+  VP8BitReader parts_[MAX_NUM_PARTITIONS];
+
+  // buffer refresh flags
+  //   bit 0: refresh Gold, bit 1: refresh Alt
+  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
+  //   bit 6: Gold sign bias, bit 7: Alt sign bias
+  //   bit 8: refresh last frame
+  uint32_t buffer_flags_;
+
+  // dequantization (one set of DC/AC dequant factor per segment)
+  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
+
+  // probabilities
+  VP8Proba proba_;
+  int use_skip_proba_;
+  uint8_t skip_p_;
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t intra_p_, last_p_, golden_p_;
+  VP8Proba proba_saved_;
+  int update_proba_;
+#endif
+
+  // Boundary data cache and persistent buffers.
+  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];  // left intra modes values
+  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
+  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+
+  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;     // filter strength info
+  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
+  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
+
+  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
+  uint8_t* cache_u_;
+  uint8_t* cache_v_;
+  int cache_y_stride_;
+  int cache_uv_stride_;
+
+  // main memory chunk for the above data. Persistent.
+  void* mem_;
+  size_t mem_size_;
+
+  // Per macroblock non-persistent infos.
+  int mb_x_, mb_y_;       // current position, in macroblock units
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  uint8_t segment_;       // block's segment
+
+  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
+  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
+  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
+  // If the bit is set, the 4x4 block contains some non-zero coefficients.
+  uint32_t non_zero_;
+  uint32_t non_zero_ac_;
+
+  // Filtering side-info
+  int filter_type_;                          // 0=off, 1=simple, 2=complex
+  int filter_row_;                           // per-row flag
+  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
+
+  // extensions
+  const uint8_t* alpha_data_;   // compressed alpha data (if present)
+  size_t alpha_data_size_;
+  uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.
+
+  int layer_colorspace_;
+  const uint8_t* layer_data_;   // compressed layer data (if present)
+  size_t layer_data_size_;
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8.c
+int VP8SetError(VP8Decoder* const dec,
+                VP8StatusCode error, const char* const msg);
+
+// in tree.c
+void VP8ResetProba(VP8Proba* const proba);
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
+void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec);
+
+// in quant.c
+void VP8ParseQuant(VP8Decoder* const dec);
+
+// in frame.c
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+// Predict a block and add residual
+void VP8ReconstructBlock(VP8Decoder* const dec);
+// Call io->setup() and finish setting up scan parameters.
+// After this call returns, one must always call VP8ExitCritical() with the
+// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
+// if ok, otherwise sets and returns the error status on *dec.
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
+// Must always be called in pair with VP8EnterCritical().
+// Returns false in case of error.
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+// Process the last decoded row (filtering + output)
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+// To be called at the start of a new scanline, to initialize predictors.
+void VP8InitScanline(VP8Decoder* const dec);
+// Decode one macroblock. Returns false if there is not enough data.
+int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
+
+// in alpha.c
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows);
+
+// in layer.c
+int VP8DecodeLayer(VP8Decoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8I_H_ */
--- a/3rdparty/libwebp/dec/vp8l.c
+++ b/3rdparty/libwebp/dec/vp8l.c
--- a/3rdparty/libwebp/dec/vp8li.h
+++ b/3rdparty/libwebp/dec/vp8li.h
@@ -0,0 +1,121 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Lossless decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora(vikaas.arora@gmail.com)
+
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_
+
+#include <string.h>     // for memcpy()
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+#include "../utils/color_cache.h"
+#include "../utils/huffman.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+typedef enum {
+  READ_DATA = 0,
+  READ_HDR = 1,
+  READ_DIM = 2
+} VP8LDecodeState;
+
+typedef struct VP8LTransform VP8LTransform;
+struct VP8LTransform {
+  VP8LImageTransformType type_;   // transform type.
+  int                    bits_;   // subsampling bits defining transform window.
+  int                    xsize_;  // transform window X index.
+  int                    ysize_;  // transform window Y index.
+  uint32_t              *data_;   // transform data.
+};
+
+typedef struct {
+  HuffmanTree htrees_[HUFFMAN_CODES_PER_META_CODE];
+} HTreeGroup;
+
+typedef struct {
+  int             color_cache_size_;
+  VP8LColorCache  color_cache_;
+
+  int             huffman_mask_;
+  int             huffman_subsample_bits_;
+  int             huffman_xsize_;
+  uint32_t       *huffman_image_;
+  int             num_htree_groups_;
+  HTreeGroup     *htree_groups_;
+} VP8LMetadata;
+
+typedef struct {
+  VP8StatusCode    status_;
+  VP8LDecodeState  action_;
+  VP8LDecodeState  state_;
+  VP8Io           *io_;
+
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *argb_;          // Internal data: always in BGRA color mode.
+  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+
+  VP8LBitReader    br_;
+
+  int              width_;
+  int              height_;
+  int              last_row_;      // last input row decoded so far.
+  int              last_out_row_;  // last row output so far.
+
+  VP8LMetadata     hdr_;
+
+  int              next_transform_;
+  VP8LTransform    transforms_[NUM_TRANSFORMS];
+  // or'd bitset storing the transforms types.
+  uint32_t         transforms_seen_;
+
+  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+} VP8LDecoder;
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8l.c
+
+// Decodes a raw image stream (without header) and store the alpha data
+// into *output, which must be of size width x height. Returns false in case
+// of error.
+int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
+                               size_t data_size, uint8_t* const output);
+
+// Allocates and initialize a new lossless decoder instance.
+VP8LDecoder* VP8LNew(void);
+
+// Decodes the image header. Returns false in case of error.
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+
+// Decodes an image. It's required to decode the lossless header before calling
+// this function. Returns false in case of error, with updated dec->status_.
+int VP8LDecodeImage(VP8LDecoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+void VP8LClear(VP8LDecoder* const dec);
+
+// Clears and deallocate a lossless decoder instance.
+void VP8LDelete(VP8LDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8LI_H_ */
--- a/3rdparty/libwebp/dec/webp.c
+++ b/3rdparty/libwebp/dec/webp.c
@@ -0,0 +1,783 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Main decoding functions for WEBP images.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../webp/mux_types.h"  // ALPHA_FLAG
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// RIFF layout is:
+//   Offset  tag
+//   0...3   "RIFF" 4-byte tag
+//   4...7   size of image data (including metadata) starting at offset 8
+//   8...11  "WEBP"   our form-type signature
+// The RIFF container (12 bytes) is followed by appropriate chunks:
+//   12..15  "VP8 ": 4-bytes tags, signaling the use of VP8 video format
+//   16..19  size of the raw VP8 image data, starting at offset 20
+//   20....  the VP8 bytes
+// Or,
+//   12..15  "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
+//   16..19  size of the raw VP8L image data, starting at offset 20
+//   20....  the VP8L bytes
+// Or,
+//   12..15  "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
+//   16..19  size of the VP8X chunk starting at offset 20.
+//   20..23  VP8X flags bit-map corresponding to the chunk-types present.
+//   24..26  Width of the Canvas Image.
+//   27..29  Height of the Canvas Image.
+// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
+// VP8L, XMP, EXIF  ...)
+// All sizes are in little-endian order.
+// Note: chunk data size must be padded to multiple of 2 when written.
+
+static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16);
+}
+
+static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
+  return (uint32_t)get_le24(data) | (data[3] << 24);
+}
+
+// Validates the RIFF container (if detected) and skips over it.
+// If a RIFF container is detected,
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and
+//         VP8_STATUS_OK otherwise.
+// In case there are not enough bytes (partial RIFF container), return 0 for
+// *riff_size. Else return the RIFF size extracted from the header.
+static VP8StatusCode ParseRIFF(const uint8_t** const data,
+                               size_t* const data_size,
+                               size_t* const riff_size) {
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(riff_size != NULL);
+
+  *riff_size = 0;  // Default: no RIFF present.
+  if (*data_size >= RIFF_HEADER_SIZE && !memcmp(*data, "RIFF", TAG_SIZE)) {
+    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
+    } else {
+      const uint32_t size = get_le32(*data + TAG_SIZE);
+      // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
+      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
+      if (size > MAX_CHUNK_PAYLOAD) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
+      // We have a RIFF container. Skip it.
+      *riff_size = size;
+      *data += RIFF_HEADER_SIZE;
+      *data_size -= RIFF_HEADER_SIZE;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+// Validates the VP8X header and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8X chunk is found, found_vp8x is set to true and *width_ptr,
+// *height_ptr and *flags_ptr are set to the corresponding values extracted
+// from the VP8X chunk.
+static VP8StatusCode ParseVP8X(const uint8_t** const data,
+                               size_t* const data_size,
+                               int* const found_vp8x,
+                               int* const width_ptr, int* const height_ptr,
+                               uint32_t* const flags_ptr) {
+  const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(found_vp8x != NULL);
+
+  *found_vp8x = 0;
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
+    int width, height;
+    uint32_t flags;
+    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
+    if (chunk_size != VP8X_CHUNK_SIZE) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
+    }
+
+    // Verify if enough data is available to validate the VP8X chunk.
+    if (*data_size < vp8x_size) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+    }
+    flags = get_le32(*data + 8);
+    width = 1 + get_le24(*data + 12);
+    height = 1 + get_le24(*data + 15);
+    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
+    }
+
+    if (flags_ptr != NULL) *flags_ptr = flags;
+    if (width_ptr != NULL) *width_ptr = width;
+    if (height_ptr != NULL) *height_ptr = height;
+    // Skip over VP8X header bytes.
+    *data += vp8x_size;
+    *data_size -= vp8x_size;
+    *found_vp8x = 1;
+  }
+  return VP8_STATUS_OK;
+}
+
+// Skips to the next VP8/VP8L chunk header in the data given the size of the
+// RIFF chunk 'riff_size'.
+// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If an alpha chunk is found, *alpha_data and *alpha_size are set
+// appropriately.
+static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
+                                         size_t* const data_size,
+                                         size_t const riff_size,
+                                         const uint8_t** const alpha_data,
+                                         size_t* const alpha_size) {
+  const uint8_t* buf;
+  size_t buf_size;
+  uint32_t total_size = TAG_SIZE +           // "WEBP".
+                        CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
+                        VP8X_CHUNK_SIZE;     // data.
+  assert(data != NULL);
+  assert(data_size != NULL);
+  buf = *data;
+  buf_size = *data_size;
+
+  assert(alpha_data != NULL);
+  assert(alpha_size != NULL);
+  *alpha_data = NULL;
+  *alpha_size = 0;
+
+  while (1) {
+    uint32_t chunk_size;
+    uint32_t disk_chunk_size;   // chunk_size with padding
+
+    *data = buf;
+    *data_size = buf_size;
+
+    if (buf_size < CHUNK_HEADER_SIZE) {  // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    chunk_size = get_le32(buf + TAG_SIZE);
+    if (chunk_size > MAX_CHUNK_PAYLOAD) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+    // For odd-sized chunk-payload, there's one byte padding at the end.
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    total_size += disk_chunk_size;
+
+    // Check that total bytes skipped so far does not exceed riff_size.
+    if (riff_size > 0 && (total_size > riff_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+
+    if (buf_size < disk_chunk_size) {             // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
+      *alpha_data = buf + CHUNK_HEADER_SIZE;
+      *alpha_size = chunk_size;
+    } else if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+               !memcmp(buf, "VP8L", TAG_SIZE)) {  // A valid VP8/VP8L header.
+      return VP8_STATUS_OK;  // Found.
+    }
+
+    // We have a full and valid chunk; skip it.
+    buf += disk_chunk_size;
+    buf_size -= disk_chunk_size;
+  }
+}
+
+// Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8L nnnn") and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (chunk larger than
+//         riff_size) VP8/VP8L header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8/VP8L chunk is found, *chunk_size is set to the total number of bytes
+// extracted from the VP8/VP8L chunk header.
+// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
+static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
+                                    size_t* const data_size,
+                                    size_t riff_size,
+                                    size_t* const chunk_size,
+                                    int* const is_lossless) {
+  const uint8_t* const data = *data_ptr;
+  const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
+  const int is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
+  const uint32_t minimal_size =
+      TAG_SIZE + CHUNK_HEADER_SIZE;  // "WEBP" + "VP8 nnnn" OR
+                                     // "WEBP" + "VP8Lnnnn"
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(chunk_size != NULL);
+  assert(is_lossless != NULL);
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (is_vp8 || is_vp8l) {
+    // Bitstream contains VP8/VP8L header.
+    const uint32_t size = get_le32(data + TAG_SIZE);
+    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
+    }
+    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
+    *chunk_size = size;
+    *data_ptr += CHUNK_HEADER_SIZE;
+    *data_size -= CHUNK_HEADER_SIZE;
+    *is_lossless = is_vp8l;
+  } else {
+    // Raw VP8/VP8L bitstream (no header).
+    *is_lossless = VP8LCheckSignature(data, *data_size);
+    *chunk_size = *data_size;
+  }
+
+  return VP8_STATUS_OK;
+}
+
+//------------------------------------------------------------------------------
+
+// Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
+// 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
+// minimal amount will be read to fetch the remaining parameters.
+// If 'headers' is non-NULL this function will attempt to locate both alpha
+// data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
+// Note: The following chunk sequences (before the raw VP8/VP8L data) are
+// considered valid by this function:
+// RIFF + VP8(L)
+// RIFF + VP8X + (optional chunks) + VP8(L)
+// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
+// VP8(L)     <-- Not a valid WebP format: only allowed for internal purpose.
+static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
+                                          size_t data_size,
+                                          int* const width,
+                                          int* const height,
+                                          int* const has_alpha,
+                                          int* const has_animation,
+                                          WebPHeaderStructure* const headers) {
+  int found_riff = 0;
+  int found_vp8x = 0;
+  VP8StatusCode status;
+  WebPHeaderStructure hdrs;
+
+  if (data == NULL || data_size < RIFF_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;
+  }
+  memset(&hdrs, 0, sizeof(hdrs));
+  hdrs.data = data;
+  hdrs.data_size = data_size;
+
+  // Skip over RIFF header.
+  status = ParseRIFF(&data, &data_size, &hdrs.riff_size);
+  if (status != VP8_STATUS_OK) {
+    return status;   // Wrong RIFF header / insufficient data.
+  }
+  found_riff = (hdrs.riff_size > 0);
+
+  // Skip over VP8X.
+  {
+    uint32_t flags = 0;
+    status = ParseVP8X(&data, &data_size, &found_vp8x, width, height, &flags);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Wrong VP8X / insufficient data.
+    }
+    if (!found_riff && found_vp8x) {
+      // Note: This restriction may be removed in the future, if it becomes
+      // necessary to send VP8X chunk to the decoder.
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
+    if (has_animation != NULL) *has_animation = !!(flags & ANIMATION_FLAG);
+    if (found_vp8x && headers == NULL) {
+      return VP8_STATUS_OK;  // Return features from VP8X header.
+    }
+  }
+
+  if (data_size < TAG_SIZE) return VP8_STATUS_NOT_ENOUGH_DATA;
+
+  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
+  if ((found_riff && found_vp8x) ||
+      (!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
+    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
+                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Found an invalid chunk size / insufficient data.
+    }
+  }
+
+  // Skip over VP8/VP8L header.
+  status = ParseVP8Header(&data, &data_size, hdrs.riff_size,
+                          &hdrs.compressed_size, &hdrs.is_lossless);
+  if (status != VP8_STATUS_OK) {
+    return status;  // Wrong VP8/VP8L chunk-header / insufficient data.
+  }
+  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+
+  if (!hdrs.is_lossless) {
+    if (data_size < VP8_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8 data.
+    if (!VP8GetInfo(data, data_size,
+                    (uint32_t)hdrs.compressed_size, width, height)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  } else {
+    if (data_size < VP8L_FRAME_HEADER_SIZE) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+    // Validates raw VP8L data.
+    if (!VP8LGetInfo(data, data_size, width, height, has_alpha)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  }
+
+  if (has_alpha != NULL) {
+    // If the data did not contain a VP8X/VP8L chunk the only definitive way
+    // to set this is by looking for alpha data (from an ALPH chunk).
+    *has_alpha |= (hdrs.alpha_data != NULL);
+  }
+  if (headers != NULL) {
+    *headers = hdrs;
+    headers->offset = data - headers->data;
+    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
+    assert(headers->offset == headers->data_size - data_size);
+  }
+  return VP8_STATUS_OK;  // Return features from VP8 header.
+}
+
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
+  VP8StatusCode status;
+  int has_animation = 0;
+  assert(headers != NULL);
+  // fill out headers, ignore width/height/has_alpha.
+  status = ParseHeadersInternal(headers->data, headers->data_size,
+                                NULL, NULL, NULL, &has_animation, headers);
+  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    // TODO(jzern): full support of animation frames will require API additions.
+    if (has_animation) {
+      status = VP8_STATUS_UNSUPPORTED_FEATURE;
+    }
+  }
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// WebPDecParams
+
+void WebPResetDecParams(WebPDecParams* const params) {
+  if (params) {
+    memset(params, 0, sizeof(*params));
+  }
+}
+
+//------------------------------------------------------------------------------
+// "Into" decoding variants
+
+// Main flow
+static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
+                                WebPDecParams* const params) {
+  VP8StatusCode status;
+  VP8Io io;
+  WebPHeaderStructure headers;
+
+  headers.data = data;
+  headers.data_size = data_size;
+  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
+  if (status != VP8_STATUS_OK) {
+    return status;
+  }
+
+  assert(params != NULL);
+  VP8InitIo(&io);
+  io.data = headers.data + headers.offset;
+  io.data_size = headers.data_size - headers.offset;
+  WebPInitCustomIo(params, &io);  // Plug the I/O functions.
+
+  if (!headers.is_lossless) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+#ifdef WEBP_USE_THREAD
+    dec->use_threads_ = params->options && (params->options->use_threads > 0);
+#else
+    dec->use_threads_ = 0;
+#endif
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+
+    // Decode bitstream header, update io->width/io->height.
+    if (!VP8GetHeaders(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8Decode(dec, &io)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8Delete(dec);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    if (!VP8LDecodeHeader(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8LDecodeImage(dec)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8LDelete(dec);
+  }
+
+  if (status != VP8_STATUS_OK) {
+    WebPFreeDecBuffer(params->output);
+  }
+  return status;
+}
+
+// Helpers
+static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
+                                     const uint8_t* const data,
+                                     size_t data_size,
+                                     uint8_t* const rgba,
+                                     int stride, size_t size) {
+  WebPDecParams params;
+  WebPDecBuffer buf;
+  if (rgba == NULL) {
+    return NULL;
+  }
+  WebPInitDecBuffer(&buf);
+  WebPResetDecParams(&params);
+  params.output = &buf;
+  buf.colorspace    = colorspace;
+  buf.u.RGBA.rgba   = rgba;
+  buf.u.RGBA.stride = stride;
+  buf.u.RGBA.size   = size;
+  buf.is_external_memory = 1;
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
+  }
+  return rgba;
+}
+
+uint8_t* WebPDecodeRGBInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size);
+}
+
+uint8_t* WebPDecodeRGBAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size);
+}
+
+uint8_t* WebPDecodeARGBInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_ARGB, data, data_size, output, stride, size);
+}
+
+uint8_t* WebPDecodeBGRInto(const uint8_t* data, size_t data_size,
+                           uint8_t* output, size_t size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size);
+}
+
+uint8_t* WebPDecodeBGRAInto(const uint8_t* data, size_t data_size,
+                            uint8_t* output, size_t size, int stride) {
+  return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size);
+}
+
+uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
+                           uint8_t* luma, size_t luma_size, int luma_stride,
+                           uint8_t* u, size_t u_size, int u_stride,
+                           uint8_t* v, size_t v_size, int v_stride) {
+  WebPDecParams params;
+  WebPDecBuffer output;
+  if (luma == NULL) return NULL;
+  WebPInitDecBuffer(&output);
+  WebPResetDecParams(&params);
+  params.output = &output;
+  output.colorspace      = MODE_YUV;
+  output.u.YUVA.y        = luma;
+  output.u.YUVA.y_stride = luma_stride;
+  output.u.YUVA.y_size   = luma_size;
+  output.u.YUVA.u        = u;
+  output.u.YUVA.u_stride = u_stride;
+  output.u.YUVA.u_size   = u_size;
+  output.u.YUVA.v        = v;
+  output.u.YUVA.v_stride = v_stride;
+  output.u.YUVA.v_size   = v_size;
+  output.is_external_memory = 1;
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
+  }
+  return luma;
+}
+
+//------------------------------------------------------------------------------
+
+static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
+                       size_t data_size, int* const width, int* const height,
+                       WebPDecBuffer* const keep_info) {
+  WebPDecParams params;
+  WebPDecBuffer output;
+
+  WebPInitDecBuffer(&output);
+  WebPResetDecParams(&params);
+  params.output = &output;
+  output.colorspace = mode;
+
+  // Retrieve (and report back) the required dimensions from bitstream.
+  if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
+    return NULL;
+  }
+  if (width != NULL) *width = output.width;
+  if (height != NULL) *height = output.height;
+
+  // Decode
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
+  }
+  if (keep_info != NULL) {    // keep track of the side-info
+    WebPCopyDecBuffer(&output, keep_info);
+  }
+  // return decoded samples (don't clear 'output'!)
+  return WebPIsRGBMode(mode) ? output.u.RGBA.rgba : output.u.YUVA.y;
+}
+
+uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                       int* width, int* height) {
+  return Decode(MODE_RGB, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
+                        int* width, int* height) {
+  return Decode(MODE_RGBA, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
+                        int* width, int* height) {
+  return Decode(MODE_ARGB, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
+                       int* width, int* height) {
+  return Decode(MODE_BGR, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                        int* width, int* height) {
+  return Decode(MODE_BGRA, data, data_size, width, height, NULL);
+}
+
+uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
+                       int* width, int* height, uint8_t** u, uint8_t** v,
+                       int* stride, int* uv_stride) {
+  WebPDecBuffer output;   // only to preserve the side-infos
+  uint8_t* const out = Decode(MODE_YUV, data, data_size,
+                              width, height, &output);
+
+  if (out != NULL) {
+    const WebPYUVABuffer* const buf = &output.u.YUVA;
+    *u = buf->u;
+    *v = buf->v;
+    *stride = buf->y_stride;
+    *uv_stride = buf->u_stride;
+    assert(buf->u_stride == buf->v_stride);
+  }
+  return out;
+}
+
+static void DefaultFeatures(WebPBitstreamFeatures* const features) {
+  assert(features != NULL);
+  memset(features, 0, sizeof(*features));
+  features->bitstream_version = 0;
+}
+
+static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
+                                 WebPBitstreamFeatures* const features) {
+  if (features == NULL || data == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  DefaultFeatures(features);
+
+  // Only parse enough of the data to retrieve the features.
+  return ParseHeadersInternal(data, data_size,
+                              &features->width, &features->height,
+                              &features->has_alpha, &features->has_animation,
+                              NULL);
+}
+
+//------------------------------------------------------------------------------
+// WebPGetInfo()
+
+int WebPGetInfo(const uint8_t* data, size_t data_size,
+                int* width, int* height) {
+  WebPBitstreamFeatures features;
+
+  if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) {
+    return 0;
+  }
+
+  if (width != NULL) {
+    *width  = features.width;
+  }
+  if (height != NULL) {
+    *height = features.height;
+  }
+
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Advance decoding API
+
+int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
+                                  int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;   // version mismatch
+  }
+  if (config == NULL) {
+    return 0;
+  }
+  memset(config, 0, sizeof(*config));
+  DefaultFeatures(&config->input);
+  WebPInitDecBuffer(&config->output);
+  return 1;
+}
+
+VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
+                                      WebPBitstreamFeatures* features,
+                                      int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return VP8_STATUS_INVALID_PARAM;   // version mismatch
+  }
+  if (features == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  return GetFeatures(data, data_size, features);
+}
+
+VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
+                         WebPDecoderConfig* config) {
+  WebPDecParams params;
+  VP8StatusCode status;
+
+  if (config == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+
+  status = GetFeatures(data, data_size, &config->input);
+  if (status != VP8_STATUS_OK) {
+    if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Not-enough-data treated as error.
+    }
+    return status;
+  }
+
+  WebPResetDecParams(&params);
+  params.output = &config->output;
+  params.options = &config->options;
+  status = DecodeInto(data, data_size, &params);
+
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// Cropping and rescaling.
+
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    x = options->crop_left;
+    y = options->crop_top;
+    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420 or YUV422
+      x &= ~1;
+      y &= ~1;    // TODO(later): only for YUV420, not YUV422.
+    }
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+      return 0;  // out of frame boundary error
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // disable filter (only for large downscaling ratio).
+    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
+                           (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dec/webpi.h
+++ b/3rdparty/libwebp/dec/webpi.h
@@ -0,0 +1,114 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Internal header: WebP decoding parameters and custom IO on buffer
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#include "../utils/rescaler.h"
+#include "./decode_vp8.h"
+
+//------------------------------------------------------------------------------
+// WebPDecParams: Decoding output parameters. Transient internal object.
+
+typedef struct WebPDecParams WebPDecParams;
+typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
+
+struct WebPDecParams {
+  WebPDecBuffer* output;             // output buffer.
+  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
+                                     // or used for tmp rescaling
+
+  int last_y;                 // coordinate of the line that was last output
+  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
+  // rescalers
+  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+  void* memory;                  // overall scratch memory for the output work.
+
+  OutputFunc emit;               // output RGB or YUV samples
+  OutputFunc emit_alpha;         // output alpha channel
+  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
+};
+
+// Should be called first, before any use of the WebPDecParams object.
+void WebPResetDecParams(WebPDecParams* const params);
+
+//------------------------------------------------------------------------------
+// Header parsing helpers
+
+// Structure storing a description of the RIFF headers.
+typedef struct {
+  const uint8_t* data;         // input buffer
+  size_t data_size;            // input buffer size
+  size_t offset;               // offset to main data chunk (VP8 or VP8L)
+  const uint8_t* alpha_data;   // points to alpha chunk (if present)
+  size_t alpha_data_size;      // alpha chunk size
+  size_t compressed_size;      // VP8/VP8L compressed data size
+  size_t riff_size;            // size of the riff payload (or 0 if absent)
+  int is_lossless;             // true if a VP8L chunk is present
+} WebPHeaderStructure;
+
+// Skips over all valid chunks prior to the first VP8/VP8L frame header.
+// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
+// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
+// in the case of non-decodable features (animation for instance).
+// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
+// fields are updated appropriately upon success.
+VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
+
+//------------------------------------------------------------------------------
+// Misc utils
+
+// Initializes VP8Io with custom setup, io and teardown functions. The default
+// hooks will use the supplied 'params' as io->opaque handle.
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
+
+// Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
+// to the *compressed* format, not the output one.
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+
+//------------------------------------------------------------------------------
+// Internal functions regarding WebPDecBuffer memory (in buffer.c).
+// Don't really need to be externally visible for now.
+
+// Prepare 'buffer' with the requested initial dimensions width/height.
+// If no external storage is supplied, initializes buffer by allocating output
+// memory and setting up the stride information. Validate the parameters. Return
+// an error code in case of problem (no memory, or invalid stride / size /
+// dimension / etc.). If *options is not NULL, also verify that the options'
+// parameters are valid and apply them to the width/height dimensions of the
+// output buffer. This takes cropping / scaling / rotation into account.
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const buffer);
+
+// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
+// memory (still held by 'src').
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst);
+
+// Copy and transfer ownership from src to dst (beware of parameter order!)
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
+
+
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_WEBPI_H_ */
--- a/3rdparty/libwebp/demux/demux.c
+++ b/3rdparty/libwebp/demux/demux.c
@@ -0,0 +1,951 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//  WebP container demux.
+//
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../utils/utils.h"
+#include "../webp/decode.h"     // WebPGetFeatures
+#include "../webp/demux.h"
+#include "../webp/format_constants.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define DMUX_MAJ_VERSION 0
+#define DMUX_MIN_VERSION 1
+#define DMUX_REV_VERSION 0
+
+typedef struct {
+  size_t start_;        // start location of the data
+  size_t end_;          // end location
+  size_t riff_end_;     // riff chunk end location, can be > end_.
+  size_t buf_size_;     // size of the buffer
+  const uint8_t* buf_;
+} MemBuffer;
+
+typedef struct {
+  size_t offset_;
+  size_t size_;
+} ChunkData;
+
+typedef struct Frame {
+  int x_offset_, y_offset_;
+  int width_, height_;
+  int duration_;
+  WebPMuxAnimDispose dispose_method_;
+  int is_fragment_;  // this is a frame fragment (and not a full frame).
+  int frame_num_;  // the referent frame number for use in assembling fragments.
+  int complete_;   // img_components_ contains a full image.
+  ChunkData img_components_[2];  // 0=VP8{,L} 1=ALPH
+  struct Frame* next_;
+} Frame;
+
+typedef struct Chunk {
+  ChunkData data_;
+  struct Chunk* next_;
+} Chunk;
+
+struct WebPDemuxer {
+  MemBuffer mem_;
+  WebPDemuxState state_;
+  int is_ext_format_;
+  uint32_t feature_flags_;
+  int canvas_width_, canvas_height_;
+  int loop_count_;
+  uint32_t bgcolor_;
+  int num_frames_;
+  Frame* frames_;
+  Frame** frames_tail_;
+  Chunk* chunks_;  // non-image chunks
+};
+
+typedef enum {
+  PARSE_OK,
+  PARSE_NEED_MORE_DATA,
+  PARSE_ERROR
+} ParseStatus;
+
+typedef struct ChunkParser {
+  uint8_t id[4];
+  ParseStatus (*parse)(WebPDemuxer* const dmux);
+  int (*valid)(const WebPDemuxer* const dmux);
+} ChunkParser;
+
+static ParseStatus ParseSingleImage(WebPDemuxer* const dmux);
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux);
+static int IsValidSimpleFormat(const WebPDemuxer* const dmux);
+static int IsValidExtendedFormat(const WebPDemuxer* const dmux);
+
+static const ChunkParser kMasterChunks[] = {
+  { { 'V', 'P', '8', ' ' }, ParseSingleImage, IsValidSimpleFormat },
+  { { 'V', 'P', '8', 'L' }, ParseSingleImage, IsValidSimpleFormat },
+  { { 'V', 'P', '8', 'X' }, ParseVP8X,        IsValidExtendedFormat },
+  { { '0', '0', '0', '0' }, NULL,             NULL },
+};
+
+//------------------------------------------------------------------------------
+
+int WebPGetDemuxVersion(void) {
+  return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
+}
+
+// -----------------------------------------------------------------------------
+// MemBuffer
+
+static int RemapMemBuffer(MemBuffer* const mem,
+                          const uint8_t* data, size_t size) {
+  if (size < mem->buf_size_) return 0;  // can't remap to a shorter buffer!
+
+  mem->buf_ = data;
+  mem->end_ = mem->buf_size_ = size;
+  return 1;
+}
+
+static int InitMemBuffer(MemBuffer* const mem,
+                         const uint8_t* data, size_t size) {
+  memset(mem, 0, sizeof(*mem));
+  return RemapMemBuffer(mem, data, size);
+}
+
+// Return the remaining data size available in 'mem'.
+static WEBP_INLINE size_t MemDataSize(const MemBuffer* const mem) {
+  return (mem->end_ - mem->start_);
+}
+
+// Return true if 'size' exceeds the end of the RIFF chunk.
+static WEBP_INLINE int SizeIsInvalid(const MemBuffer* const mem, size_t size) {
+  return (size > mem->riff_end_ - mem->start_);
+}
+
+static WEBP_INLINE void Skip(MemBuffer* const mem, size_t size) {
+  mem->start_ += size;
+}
+
+static WEBP_INLINE void Rewind(MemBuffer* const mem, size_t size) {
+  mem->start_ -= size;
+}
+
+static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
+  return mem->buf_ + mem->start_;
+}
+
+// Read from 'mem' and skip the read bytes.
+static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
+  const uint8_t byte = mem->buf_[mem->start_];
+  Skip(mem, 1);
+  return byte;
+}
+
+static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const int val = GetLE16(data);
+  Skip(mem, 2);
+  return val;
+}
+
+static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const int val = GetLE24(data);
+  Skip(mem, 3);
+  return val;
+}
+
+static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
+  const uint8_t* const data = mem->buf_ + mem->start_;
+  const uint32_t val = GetLE32(data);
+  Skip(mem, 4);
+  return val;
+}
+
+// -----------------------------------------------------------------------------
+// Secondary chunk parsing
+
+static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
+  Chunk** c = &dmux->chunks_;
+  while (*c != NULL) c = &(*c)->next_;
+  *c = chunk;
+  chunk->next_ = NULL;
+}
+
+// Add a frame to the end of the list, ensuring the last frame is complete.
+// Returns true on success, false otherwise.
+static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
+  const Frame* const last_frame = *dmux->frames_tail_;
+  if (last_frame != NULL && !last_frame->complete_) return 0;
+
+  *dmux->frames_tail_ = frame;
+  frame->next_ = NULL;
+  dmux->frames_tail_ = &frame->next_;
+  return 1;
+}
+
+// Store image bearing chunks to 'frame'.
+// If 'has_vp8l_alpha' is not NULL, it will be set to true if the frame is a
+// lossless image with alpha.
+static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
+                              MemBuffer* const mem, Frame* const frame,
+                              int* const has_vp8l_alpha) {
+  int alpha_chunks = 0;
+  int image_chunks = 0;
+  int done = (MemDataSize(mem) < min_size);
+  ParseStatus status = PARSE_OK;
+
+  if (has_vp8l_alpha != NULL) *has_vp8l_alpha = 0;  // Default.
+
+  if (done) return PARSE_NEED_MORE_DATA;
+
+  do {
+    const size_t chunk_start_offset = mem->start_;
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t payload_size = ReadLE32(mem);
+    const uint32_t payload_size_padded = payload_size + (payload_size & 1);
+    const size_t payload_available = (payload_size_padded > MemDataSize(mem))
+                                   ? MemDataSize(mem) : payload_size_padded;
+    const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
+
+    if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+    if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
+    if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
+
+    switch (fourcc) {
+      case MKFOURCC('A', 'L', 'P', 'H'):
+        if (alpha_chunks == 0) {
+          ++alpha_chunks;
+          frame->img_components_[1].offset_ = chunk_start_offset;
+          frame->img_components_[1].size_ = chunk_size;
+          frame->frame_num_ = frame_num;
+          Skip(mem, payload_available);
+        } else {
+          goto Done;
+        }
+        break;
+      case MKFOURCC('V', 'P', '8', 'L'):
+        if (alpha_chunks > 0) return PARSE_ERROR;  // VP8L has its own alpha
+        // fall through
+      case MKFOURCC('V', 'P', '8', ' '):
+        if (image_chunks == 0) {
+          // Extract the bitstream features, tolerating failures when the data
+          // is incomplete.
+          WebPBitstreamFeatures features;
+          const VP8StatusCode vp8_status =
+              WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
+                              &features);
+          if (status == PARSE_NEED_MORE_DATA &&
+              vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
+            return PARSE_NEED_MORE_DATA;
+          } else if (vp8_status != VP8_STATUS_OK) {
+            // We have enough data, and yet WebPGetFeatures() failed.
+            return PARSE_ERROR;
+          }
+          ++image_chunks;
+          frame->img_components_[0].offset_ = chunk_start_offset;
+          frame->img_components_[0].size_ = chunk_size;
+          frame->width_ = features.width;
+          frame->height_ = features.height;
+          if (has_vp8l_alpha != NULL) *has_vp8l_alpha = features.has_alpha;
+          frame->frame_num_ = frame_num;
+          frame->complete_ = (status == PARSE_OK);
+          Skip(mem, payload_available);
+        } else {
+          goto Done;
+        }
+        break;
+ Done:
+      default:
+        // Restore fourcc/size when moving up one level in parsing.
+        Rewind(mem, CHUNK_HEADER_SIZE);
+        done = 1;
+        break;
+    }
+
+    if (mem->start_ == mem->riff_end_) {
+      done = 1;
+    } else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
+      status = PARSE_NEED_MORE_DATA;
+    }
+  } while (!done && status == PARSE_OK);
+
+  return status;
+}
+
+// Creates a new Frame if 'actual_size' is within bounds and 'mem' contains
+// enough data ('min_size') to parse the payload.
+// Returns PARSE_OK on success with *frame pointing to the new Frame.
+// Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
+static ParseStatus NewFrame(const MemBuffer* const mem,
+                            uint32_t min_size, uint32_t actual_size,
+                            Frame** frame) {
+  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
+  if (actual_size < min_size) return PARSE_ERROR;
+  if (MemDataSize(mem) < min_size)  return PARSE_NEED_MORE_DATA;
+
+  *frame = (Frame*)calloc(1, sizeof(**frame));
+  return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
+}
+
+// Parse a 'ANMF' chunk and any image bearing chunks that immediately follow.
+// 'frame_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseAnimationFrame(
+    WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
+  int added_frame = 0;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status =
+      NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
+  if (status != PARSE_OK) return status;
+
+  frame->x_offset_       = 2 * ReadLE24s(mem);
+  frame->y_offset_       = 2 * ReadLE24s(mem);
+  frame->width_          = 1 + ReadLE24s(mem);
+  frame->height_         = 1 + ReadLE24s(mem);
+  frame->duration_       = ReadLE24s(mem);
+  frame->dispose_method_ = (WebPMuxAnimDispose)(ReadByte(mem) & 1);
+  if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;
+  }
+
+  // Store a frame only if the animation flag is set there is some data for
+  // this frame is available.
+  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame,
+                      NULL);
+  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
+    added_frame = AddFrame(dmux, frame);
+    if (added_frame) {
+      ++dmux->num_frames_;
+    } else {
+      status = PARSE_ERROR;
+    }
+  }
+
+  if (!added_frame) free(frame);
+  return status;
+}
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+// Parse a 'FRGM' chunk and any image bearing chunks that immediately follow.
+// 'fragment_chunk_size' is the previously validated, padded chunk size.
+static ParseStatus ParseFragment(WebPDemuxer* const dmux,
+                                 uint32_t fragment_chunk_size) {
+  const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
+  int added_fragment = 0;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status =
+      NewFrame(mem, FRGM_CHUNK_SIZE, fragment_chunk_size, &frame);
+  if (status != PARSE_OK) return status;
+
+  frame->is_fragment_ = 1;
+  frame->x_offset_ = 2 * ReadLE24s(mem);
+  frame->y_offset_ = 2 * ReadLE24s(mem);
+
+  // Store a fragment only if the fragments flag is set there is some data for
+  // this fragment is available.
+  status = StoreFrame(frame_num, frgm_payload_size, mem, frame, NULL);
+  if (status != PARSE_ERROR && has_fragments && frame->frame_num_ > 0) {
+    added_fragment = AddFrame(dmux, frame);
+    if (!added_fragment) {
+      status = PARSE_ERROR;
+    } else {
+      dmux->num_frames_ = 1;
+    }
+  }
+
+  if (!added_fragment) free(frame);
+  return status;
+}
+#endif  // WEBP_EXPERIMENTAL_FEATURES
+
+// General chunk storage, starting with the header at 'start_offset', allowing
+// the user to request the payload via a fourcc string. 'size' includes the
+// header and the unpadded payload size.
+// Returns true on success, false otherwise.
+static int StoreChunk(WebPDemuxer* const dmux,
+                      size_t start_offset, uint32_t size) {
+  Chunk* const chunk = (Chunk*)calloc(1, sizeof(*chunk));
+  if (chunk == NULL) return 0;
+
+  chunk->data_.offset_ = start_offset;
+  chunk->data_.size_ = size;
+  AddChunk(dmux, chunk);
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// Primary chunk parsing
+
+static int ReadHeader(MemBuffer* const mem) {
+  const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
+  uint32_t riff_size;
+
+  // Basic file level validation.
+  if (MemDataSize(mem) < min_size) return 0;
+  if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
+      memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
+    return 0;
+  }
+
+  riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
+  if (riff_size < CHUNK_HEADER_SIZE) return 0;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
+
+  // There's no point in reading past the end of the RIFF chunk
+  mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
+  if (mem->buf_size_ > mem->riff_end_) {
+    mem->buf_size_ = mem->end_ = mem->riff_end_;
+  }
+
+  Skip(mem, RIFF_HEADER_SIZE);
+  return 1;
+}
+
+static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
+  const size_t min_size = CHUNK_HEADER_SIZE;
+  MemBuffer* const mem = &dmux->mem_;
+  Frame* frame;
+  ParseStatus status;
+  int has_vp8l_alpha = 0;  // Frame contains a lossless image with alpha.
+
+  if (dmux->frames_ != NULL) return PARSE_ERROR;
+  if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
+
+  frame = (Frame*)calloc(1, sizeof(*frame));
+  if (frame == NULL) return PARSE_ERROR;
+
+  // For the single image case we allow parsing of a partial frame, but we need
+  // at least CHUNK_HEADER_SIZE for parsing.
+  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame,
+                      &has_vp8l_alpha);
+  if (status != PARSE_ERROR) {
+    const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
+    // Clear any alpha when the alpha flag is missing.
+    if (!has_alpha && frame->img_components_[1].size_ > 0) {
+      frame->img_components_[1].offset_ = 0;
+      frame->img_components_[1].size_ = 0;
+    }
+
+    // Use the frame width/height as the canvas values for non-vp8x files.
+    // Also, set ALPHA_FLAG if this is a lossless image with alpha.
+    if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
+      dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+      dmux->canvas_width_ = frame->width_;
+      dmux->canvas_height_ = frame->height_;
+      dmux->feature_flags_ |= has_vp8l_alpha ? ALPHA_FLAG : 0;
+    }
+    AddFrame(dmux, frame);
+    dmux->num_frames_ = 1;
+  } else {
+    free(frame);
+  }
+
+  return status;
+}
+
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+  MemBuffer* const mem = &dmux->mem_;
+  int anim_chunks = 0;
+  uint32_t vp8x_size;
+  ParseStatus status = PARSE_OK;
+
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = ReadLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = ReadByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + ReadLE24s(mem);
+  dmux->canvas_height_ = 1 + ReadLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  do {
+    int store_chunk = 1;
+    const size_t chunk_start_offset = mem->start_;
+    const uint32_t fourcc = ReadLE32(mem);
+    const uint32_t chunk_size = ReadLE32(mem);
+    const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
+
+    if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+    if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
+
+    switch (fourcc) {
+      case MKFOURCC('V', 'P', '8', 'X'): {
+        return PARSE_ERROR;
+      }
+      case MKFOURCC('A', 'L', 'P', 'H'):
+      case MKFOURCC('V', 'P', '8', ' '):
+      case MKFOURCC('V', 'P', '8', 'L'): {
+        // check that this isn't an animation (all frames should be in an ANMF).
+        if (anim_chunks > 0) return PARSE_ERROR;
+
+        Rewind(mem, CHUNK_HEADER_SIZE);
+        status = ParseSingleImage(dmux);
+        break;
+      }
+      case MKFOURCC('A', 'N', 'I', 'M'): {
+        if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;
+
+        if (MemDataSize(mem) < chunk_size_padded) {
+          status = PARSE_NEED_MORE_DATA;
+        } else if (anim_chunks == 0) {
+          ++anim_chunks;
+          dmux->bgcolor_ = ReadLE32(mem);
+          dmux->loop_count_ = ReadLE16s(mem);
+          Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
+        } else {
+          store_chunk = 0;
+          goto Skip;
+        }
+        break;
+      }
+      case MKFOURCC('A', 'N', 'M', 'F'): {
+        if (anim_chunks == 0) return PARSE_ERROR;  // 'ANIM' precedes frames.
+        status = ParseAnimationFrame(dmux, chunk_size_padded);
+        break;
+      }
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      case MKFOURCC('F', 'R', 'G', 'M'): {
+        status = ParseFragment(dmux, chunk_size_padded);
+        break;
+      }
+#endif
+      case MKFOURCC('I', 'C', 'C', 'P'): {
+        store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
+        goto Skip;
+      }
+      case MKFOURCC('X', 'M', 'P', ' '): {
+        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
+        goto Skip;
+      }
+      case MKFOURCC('E', 'X', 'I', 'F'): {
+        store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
+        goto Skip;
+      }
+ Skip:
+      default: {
+        if (chunk_size_padded <= MemDataSize(mem)) {
+          if (store_chunk) {
+            // Store only the chunk header and unpadded size as only the payload
+            // will be returned to the user.
+            if (!StoreChunk(dmux, chunk_start_offset,
+                            CHUNK_HEADER_SIZE + chunk_size)) {
+              return PARSE_ERROR;
+            }
+          }
+          Skip(mem, chunk_size_padded);
+        } else {
+          status = PARSE_NEED_MORE_DATA;
+        }
+      }
+    }
+
+    if (mem->start_ == mem->riff_end_) {
+      break;
+    } else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
+      status = PARSE_NEED_MORE_DATA;
+    }
+  } while (status == PARSE_OK);
+
+  return status;
+}
+
+// -----------------------------------------------------------------------------
+// Format validation
+
+static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
+  const Frame* const frame = dmux->frames_;
+  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
+
+  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
+  if (dmux->state_ == WEBP_DEMUX_DONE && frame == NULL) return 0;
+
+  if (frame->width_ <= 0 || frame->height_ <= 0) return 0;
+  return 1;
+}
+
+static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
+  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const Frame* f;
+
+  if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
+
+  if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
+  if (dmux->loop_count_ < 0) return 0;
+  if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
+
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
+    const int cur_frame_set = f->frame_num_;
+    int frame_count = 0, fragment_count = 0;
+
+    // Check frame properties and if the image is composed of fragments that
+    // each fragment came from a fragment.
+    for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
+      const ChunkData* const image = f->img_components_;
+      const ChunkData* const alpha = f->img_components_ + 1;
+
+      if (!has_fragments && f->is_fragment_) return 0;
+      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (f->x_offset_ < 0 || f->y_offset_ < 0) return 0;
+      if (f->complete_) {
+        if (alpha->size_ == 0 && image->size_ == 0) return 0;
+        // Ensure alpha precedes image bitstream.
+        if (alpha->size_ > 0 && alpha->offset_ > image->offset_) {
+          return 0;
+        }
+
+        if (f->width_ <= 0 || f->height_ <= 0) return 0;
+      } else {
+        // There shouldn't be a partial frame in a complete file.
+        if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
+
+        // Ensure alpha precedes image bitstream.
+        if (alpha->size_ > 0 && image->size_ > 0 &&
+            alpha->offset_ > image->offset_) {
+          return 0;
+        }
+        // There shouldn't be any frames after an incomplete one.
+        if (f->next_ != NULL) return 0;
+      }
+
+      fragment_count += f->is_fragment_;
+      ++frame_count;
+    }
+    if (!has_fragments && frame_count > 1) return 0;
+    if (fragment_count > 0 && frame_count != fragment_count) return 0;
+    if (f == NULL) break;
+  }
+  return 1;
+}
+
+// -----------------------------------------------------------------------------
+// WebPDemuxer object
+
+static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
+  dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
+  dmux->loop_count_ = 1;
+  dmux->bgcolor_ = 0xFFFFFFFF;  // White background by default.
+  dmux->canvas_width_ = -1;
+  dmux->canvas_height_ = -1;
+  dmux->frames_tail_ = &dmux->frames_;
+  dmux->mem_ = *mem;
+}
+
+WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
+                               WebPDemuxState* state, int version) {
+  const ChunkParser* parser;
+  int partial;
+  ParseStatus status = PARSE_ERROR;
+  MemBuffer mem;
+  WebPDemuxer* dmux;
+
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
+  if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
+
+  if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
+  if (!ReadHeader(&mem)) return NULL;
+
+  partial = (mem.buf_size_ < mem.riff_end_);
+  if (!allow_partial && partial) return NULL;
+
+  dmux = (WebPDemuxer*)calloc(1, sizeof(*dmux));
+  if (dmux == NULL) return NULL;
+  InitDemux(dmux, &mem);
+
+  for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
+    if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
+      status = parser->parse(dmux);
+      if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
+      if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
+      if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
+      break;
+    }
+  }
+  if (state) *state = dmux->state_;
+
+  if (status == PARSE_ERROR) {
+    WebPDemuxDelete(dmux);
+    return NULL;
+  }
+  return dmux;
+}
+
+void WebPDemuxDelete(WebPDemuxer* dmux) {
+  Chunk* c;
+  Frame* f;
+  if (dmux == NULL) return;
+
+  for (f = dmux->frames_; f != NULL;) {
+    Frame* const cur_frame = f;
+    f = f->next_;
+    free(cur_frame);
+  }
+  for (c = dmux->chunks_; c != NULL;) {
+    Chunk* const cur_chunk = c;
+    c = c->next_;
+    free(cur_chunk);
+  }
+  free(dmux);
+}
+
+// -----------------------------------------------------------------------------
+
+uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
+  if (dmux == NULL) return 0;
+
+  switch (feature) {
+    case WEBP_FF_FORMAT_FLAGS:     return dmux->feature_flags_;
+    case WEBP_FF_CANVAS_WIDTH:     return (uint32_t)dmux->canvas_width_;
+    case WEBP_FF_CANVAS_HEIGHT:    return (uint32_t)dmux->canvas_height_;
+    case WEBP_FF_LOOP_COUNT:       return (uint32_t)dmux->loop_count_;
+    case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
+    case WEBP_FF_FRAME_COUNT:      return (uint32_t)dmux->num_frames_;
+  }
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// Frame iteration
+
+// Find the first 'frame_num' frame. There may be multiple such frames in a
+// fragmented frame.
+static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
+  const Frame* f;
+  for (f = dmux->frames_; f != NULL; f = f->next_) {
+    if (frame_num == f->frame_num_) break;
+  }
+  return f;
+}
+
+// Returns fragment 'fragment_num' and the total count.
+static const Frame* GetFragment(
+    const Frame* const frame_set, int fragment_num, int* const count) {
+  const int this_frame = frame_set->frame_num_;
+  const Frame* f = frame_set;
+  const Frame* fragment = NULL;
+  int total;
+
+  for (total = 0; f != NULL && f->frame_num_ == this_frame; f = f->next_) {
+    if (++total == fragment_num) fragment = f;
+  }
+  *count = total;
+  return fragment;
+}
+
+static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
+                                      const Frame* const frame,
+                                      size_t* const data_size) {
+  *data_size = 0;
+  if (frame != NULL) {
+    const ChunkData* const image = frame->img_components_;
+    const ChunkData* const alpha = frame->img_components_ + 1;
+    size_t start_offset = image->offset_;
+    *data_size = image->size_;
+
+    // if alpha exists it precedes image, update the size allowing for
+    // intervening chunks.
+    if (alpha->size_ > 0) {
+      const size_t inter_size = (image->offset_ > 0)
+                              ? image->offset_ - (alpha->offset_ + alpha->size_)
+                              : 0;
+      start_offset = alpha->offset_;
+      *data_size  += alpha->size_ + inter_size;
+    }
+    return mem_buf + start_offset;
+  }
+  return NULL;
+}
+
+// Create a whole 'frame' from VP8 (+ alpha) or lossless.
+static int SynthesizeFrame(const WebPDemuxer* const dmux,
+                           const Frame* const first_frame,
+                           int fragment_num, WebPIterator* const iter) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  int num_fragments;
+  size_t payload_size = 0;
+  const Frame* const fragment =
+      GetFragment(first_frame, fragment_num, &num_fragments);
+  const uint8_t* const payload =
+      GetFramePayload(mem_buf, fragment, &payload_size);
+  if (payload == NULL) return 0;
+  assert(first_frame != NULL);
+
+  iter->frame_num      = first_frame->frame_num_;
+  iter->num_frames     = dmux->num_frames_;
+  iter->fragment_num   = fragment_num;
+  iter->num_fragments  = num_fragments;
+  iter->x_offset       = fragment->x_offset_;
+  iter->y_offset       = fragment->y_offset_;
+  iter->width          = fragment->width_;
+  iter->height         = fragment->height_;
+  iter->duration       = fragment->duration_;
+  iter->dispose_method = fragment->dispose_method_;
+  iter->complete       = fragment->complete_;
+  iter->fragment.bytes = payload;
+  iter->fragment.size  = payload_size;
+  // TODO(jzern): adjust offsets for 'FRGM's embedded in 'ANMF's
+  return 1;
+}
+
+static int SetFrame(int frame_num, WebPIterator* const iter) {
+  const Frame* frame;
+  const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+  if (dmux == NULL || frame_num < 0) return 0;
+  if (frame_num > dmux->num_frames_) return 0;
+  if (frame_num == 0) frame_num = dmux->num_frames_;
+
+  frame = GetFrame(dmux, frame_num);
+  if (frame == NULL) return 0;
+
+  return SynthesizeFrame(dmux, frame, 1, iter);
+}
+
+int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
+  if (iter == NULL) return 0;
+
+  memset(iter, 0, sizeof(*iter));
+  iter->private_ = (void*)dmux;
+  return SetFrame(frame, iter);
+}
+
+int WebPDemuxNextFrame(WebPIterator* iter) {
+  if (iter == NULL) return 0;
+  return SetFrame(iter->frame_num + 1, iter);
+}
+
+int WebPDemuxPrevFrame(WebPIterator* iter) {
+  if (iter == NULL) return 0;
+  if (iter->frame_num <= 1) return 0;
+  return SetFrame(iter->frame_num - 1, iter);
+}
+
+int WebPDemuxSelectFragment(WebPIterator* iter, int fragment_num) {
+  if (iter != NULL && iter->private_ != NULL && fragment_num > 0) {
+    const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+    const Frame* const frame = GetFrame(dmux, iter->frame_num);
+    if (frame == NULL) return 0;
+
+    return SynthesizeFrame(dmux, frame, fragment_num, iter);
+  }
+  return 0;
+}
+
+void WebPDemuxReleaseIterator(WebPIterator* iter) {
+  (void)iter;
+}
+
+// -----------------------------------------------------------------------------
+// Chunk iteration
+
+static int ChunkCount(const WebPDemuxer* const dmux, const char fourcc[4]) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  const Chunk* c;
+  int count = 0;
+  for (c = dmux->chunks_; c != NULL; c = c->next_) {
+    const uint8_t* const header = mem_buf + c->data_.offset_;
+    if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
+  }
+  return count;
+}
+
+static const Chunk* GetChunk(const WebPDemuxer* const dmux,
+                             const char fourcc[4], int chunk_num) {
+  const uint8_t* const mem_buf = dmux->mem_.buf_;
+  const Chunk* c;
+  int count = 0;
+  for (c = dmux->chunks_; c != NULL; c = c->next_) {
+    const uint8_t* const header = mem_buf + c->data_.offset_;
+    if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
+    if (count == chunk_num) break;
+  }
+  return c;
+}
+
+static int SetChunk(const char fourcc[4], int chunk_num,
+                    WebPChunkIterator* const iter) {
+  const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
+  int count;
+
+  if (dmux == NULL || fourcc == NULL || chunk_num < 0) return 0;
+  count = ChunkCount(dmux, fourcc);
+  if (count == 0) return 0;
+  if (chunk_num == 0) chunk_num = count;
+
+  if (chunk_num <= count) {
+    const uint8_t* const mem_buf = dmux->mem_.buf_;
+    const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
+    iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
+    iter->chunk.size  = chunk->data_.size_ - CHUNK_HEADER_SIZE;
+    iter->num_chunks  = count;
+    iter->chunk_num   = chunk_num;
+    return 1;
+  }
+  return 0;
+}
+
+int WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                      const char fourcc[4], int chunk_num,
+                      WebPChunkIterator* iter) {
+  if (iter == NULL) return 0;
+
+  memset(iter, 0, sizeof(*iter));
+  iter->private_ = (void*)dmux;
+  return SetChunk(fourcc, chunk_num, iter);
+}
+
+int WebPDemuxNextChunk(WebPChunkIterator* iter) {
+  if (iter != NULL) {
+    const char* const fourcc =
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num + 1, iter);
+  }
+  return 0;
+}
+
+int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
+  if (iter != NULL && iter->chunk_num > 1) {
+    const char* const fourcc =
+        (const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
+    return SetChunk(fourcc, iter->chunk_num - 1, iter);
+  }
+  return 0;
+}
+
+void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
+  (void)iter;
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/cpu.c
+++ b/3rdparty/libwebp/dsp/cpu.c
@@ -0,0 +1,85 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "./dsp.h"
+
+#if defined(__ANDROID__)
+#include <cpu-features.h>
+#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type));
+}
+#elif defined(WEBP_MSC_SSE2)
+#define GetCPUInfo __cpuid
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);
+  }
+  if (feature == kSSE3) {
+    return 0 != (cpu_info[2] & 0x00000001);
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)
+static int AndroidCPUInfo(CPUFeature feature) {
+  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+  const uint64_t cpu_features = android_getCpuFeatures();
+  if (feature == kNEON) {
+    return (cpu_family == ANDROID_CPU_FAMILY_ARM &&
+            0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON));
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(__ARM_NEON__)
+// define a dummy function to enable turning off NEON at runtime by setting
+// VP8DecGetCPUInfo = NULL
+static int armCPUInfo(CPUFeature feature) {
+  (void)feature;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/dec.c
+++ b/3rdparty/libwebp/dsp/dec.c
@@ -0,0 +1,737 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Speed-critical decoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "../dec/vp8i.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t abs0[255 + 255 + 1];     // abs(i)
+static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
+static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
+static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
+static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+static void DspInitTables(void) {
+  if (!tables_ok) {
+    int i;
+    for (i = -255; i <= 255; ++i) {
+      abs0[255 + i] = (i < 0) ? -i : i;
+      abs1[255 + i] = abs0[255 + i] >> 1;
+    }
+    for (i = -1020; i <= 1020; ++i) {
+      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
+    }
+    for (i = -112; i <= 112; ++i) {
+      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
+    }
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+    }
+    tables_ok = 1;
+  }
+}
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#define STORE(x, y, v) \
+  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int C[4 * 4], *tmp;
+  int i;
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // vertical pass
+    const int a = in[0] + in[8];    // [-4096, 4094]
+    const int b = in[0] - in[8];    // [-4095, 4095]
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    tmp[0] = a + d;   // [-7881, 7875]
+    tmp[1] = b + c;   // [-7878, 7878]
+    tmp[2] = b - c;   // [-7878, 7878]
+    tmp[3] = a - d;   // [-7877, 7879]
+    tmp += 4;
+    in++;
+  }
+  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
+  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // After the second pass, maximum interval is [-3794, 3794], assuming
+  // an input in [-2048, 2047] interval. We then need to add a dst value
+  // in the [0, 255] range.
+  // In the worst case scenario, the input to clip_8b() can be as large as
+  // [-60713, 60968].
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // horizontal pass
+    const int dc = tmp[0] + 4;
+    const int a =  dc +  tmp[8];
+    const int b =  dc -  tmp[8];
+    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    STORE(0, 0, a + d);
+    STORE(1, 0, b + c);
+    STORE(2, 0, b - c);
+    STORE(3, 0, a - d);
+    tmp++;
+    dst += BPS;
+  }
+}
+#undef MUL
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+static void TransformUV(const int16_t* in, uint8_t* dst) {
+  VP8Transform(in + 0 * 16, dst, 1);
+  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
+}
+
+static void TransformDC(const int16_t *in, uint8_t* dst) {
+  const int DC = in[0] + 4;
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    for (i = 0; i < 4; ++i) {
+      STORE(i, j, DC);
+    }
+  }
+}
+
+static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+  if (in[0 * 16]) TransformDC(in + 0 * 16, dst);
+  if (in[1 * 16]) TransformDC(in + 1 * 16, dst + 4);
+  if (in[2 * 16]) TransformDC(in + 2 * 16, dst + 4 * BPS);
+  if (in[3 * 16]) TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+}
+
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Paragraph 14.3
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    const int a0 = in[0 + i] + in[12 + i];
+    const int a1 = in[4 + i] + in[ 8 + i];
+    const int a2 = in[4 + i] - in[ 8 + i];
+    const int a3 = in[0 + i] - in[12 + i];
+    tmp[0  + i] = a0 + a1;
+    tmp[8  + i] = a0 - a1;
+    tmp[4  + i] = a3 + a2;
+    tmp[12 + i] = a3 - a2;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
+    const int a0 = dc             + tmp[3 + i * 4];
+    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
+    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
+    const int a3 = dc             - tmp[3 + i * 4];
+    out[ 0] = (a0 + a1) >> 3;
+    out[16] = (a3 + a2) >> 3;
+    out[32] = (a0 - a1) >> 3;
+    out[48] = (a3 - a2) >> 3;
+    out += 64;
+  }
+}
+
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+
+static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
+  const uint8_t* top = dst - BPS;
+  const uint8_t* const clip0 = clip1 + 255 - top[-1];
+  int y;
+  for (y = 0; y < size; ++y) {
+    const uint8_t* const clip = clip0 + dst[-1];
+    int x;
+    for (x = 0; x < size; ++x) {
+      dst[x] = clip[top[x]];
+    }
+    dst += BPS;
+  }
+}
+static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t *dst) {     // vertical
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 16);
+  }
+}
+
+static void HE16(uint8_t *dst) {     // horizontal
+  int j;
+  for (j = 16; j > 0; --j) {
+    memset(dst, dst[-1], 16);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void Put16(int v, uint8_t* dst) {
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memset(dst + j * BPS, v, 16);
+  }
+}
+
+static void DC16(uint8_t *dst) {    // DC
+  int DC = 16;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS] + dst[j - BPS];
+  }
+  Put16(DC >> 5, dst);
+}
+
+static void DC16NoTop(uint8_t *dst) {   // DC with top samples not available
+  int DC = 8;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t *dst) {  // DC with left samples not available
+  int DC = 8;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    DC += dst[i - BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// 4x4
+
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t *dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  const uint8_t vals[4] = {
+    AVG3(top[-1], top[0], top[1]),
+    AVG3(top[ 0], top[1], top[2]),
+    AVG3(top[ 1], top[2], top[3]),
+    AVG3(top[ 2], top[3], top[4])
+  };
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, sizeof(vals));
+  }
+}
+
+static void HE4(uint8_t *dst) {    // horizontal
+  const int A = dst[-1 - BPS];
+  const int B = dst[-1];
+  const int C = dst[-1 + BPS];
+  const int D = dst[-1 + 2 * BPS];
+  const int E = dst[-1 + 3 * BPS];
+  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
+  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
+  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
+  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+}
+
+static void DC4(uint8_t *dst) {   // DC
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
+}
+
+static void RD4(uint8_t *dst) {   // Down-right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t *dst) {   // Down-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t *dst) {   // Vertical-Right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+static void VL4(uint8_t *dst) {   // Vertical-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t *dst) {   // Horizontal-Up
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t *dst) {  // Horizontal-Down
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t *dst) {    // vertical
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 8);
+  }
+}
+
+static void HE8uv(uint8_t *dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memset(dst, dst[-1], 8);
+    dst += BPS;
+  }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
+  int j;
+#ifndef WEBP_REFERENCE_IMPLEMENTATION
+  const uint64_t v = (uint64_t)value * 0x0101010101010101ULL;
+  for (j = 0; j < 8; ++j) {
+    *(uint64_t*)(dst + j * BPS) = v;
+  }
+#else
+  for (j = 0; j < 8; ++j) memset(dst + j * BPS, value, 8);
+#endif
+}
+
+static void DC8uv(uint8_t *dst) {     // DC
+  int dc0 = 8;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 4, dst);
+}
+
+static void DC8uvNoLeft(uint8_t *dst) {   // DC with no left samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t *dst) {  // DC with no top samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
+  Put8x8uv(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// default C implementations
+
+const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
+  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
+};
+
+const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
+  DC16, TM16, VE16, HE16,
+  DC16NoTop, DC16NoLeft, DC16NoTopLeft
+};
+
+const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
+  DC8uv, TM8uv, VE8uv, HE8uv,
+  DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
+};
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
+  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  p[-step] = clip1[255 + p0 + a2];
+  p[    0] = clip1[255 + q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int a1 = sclip2[112 + ((a + 4) >> 3)];
+  const int a2 = sclip2[112 + ((a + 3) >> 3)];
+  const int a3 = (a1 + 1) >> 1;
+  p[-2*step] = clip1[255 + p1 + a3];
+  p[-  step] = clip1[255 + p0 + a2];
+  p[      0] = clip1[255 + q0 - a1];
+  p[   step] = clip1[255 + q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
+  const int a = sclip1[1020 + 3 * (q0 - p0) + sclip1[1020 + p1 - q1]];
+  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
+  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
+  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
+  p[-3*step] = clip1[255 + p2 + a3];
+  p[-2*step] = clip1[255 + p1 + a2];
+  p[-  step] = clip1[255 + p0 + a1];
+  p[      0] = clip1[255 + q0 - a1];
+  p[   step] = clip1[255 + q1 - a2];
+  p[ 2*step] = clip1[255 + q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
+  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
+  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
+    return 0;
+  return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
+         abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
+         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i, stride, thresh)) {
+      do_filter2(p + i, stride);
+    }
+  }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i * stride, 1, thresh)) {
+      do_filter2(p + i * stride, 1);
+    }
+  }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter6(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter4(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+//------------------------------------------------------------------------------
+
+VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformUV;
+VP8DecIdct VP8TransformDC;
+VP8DecIdct VP8TransformDCUV;
+
+VP8LumaFilterFunc VP8VFilter16;
+VP8LumaFilterFunc VP8HFilter16;
+VP8ChromaFilterFunc VP8VFilter8;
+VP8ChromaFilterFunc VP8HFilter8;
+VP8LumaFilterFunc VP8VFilter16i;
+VP8LumaFilterFunc VP8HFilter16i;
+VP8ChromaFilterFunc VP8VFilter8i;
+VP8ChromaFilterFunc VP8HFilter8i;
+VP8SimpleFilterFunc VP8SimpleVFilter16;
+VP8SimpleFilterFunc VP8SimpleHFilter16;
+VP8SimpleFilterFunc VP8SimpleVFilter16i;
+VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+extern void VP8DspInitSSE2(void);
+extern void VP8DspInitNEON(void);
+
+void VP8DspInit(void) {
+  DspInitTables();
+
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8DspInitSSE2();
+    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
+#endif
+  }
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/dec_neon.c
+++ b/3rdparty/libwebp/dsp/dec_neon.c
@@ -0,0 +1,405 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of dsp functions and loop filtering.
+//
+// Authors: Somnath Banerjee (somnath@google.com)
+//          Johann Koenig (johannkoenig@google.com)
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_NEON)
+
+#include "../dec/vp8i.h"
+
+#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
+              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+
+#define FLIP_SIGN_BIT2(a, b, s)                                                \
+  "veor     " #a "," #a "," #s "               \n"                             \
+  "veor     " #b "," #b "," #s "               \n"                             \
+
+#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
+  FLIP_SIGN_BIT2(a, b, s)                                                      \
+  FLIP_SIGN_BIT2(c, d, s)                                                      \
+
+#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
+  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
+  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
+  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
+  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
+  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
+  "vdup.8     q14, " #thresh "            \n"                                  \
+  "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
+
+#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
+  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
+  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
+  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
+
+#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
+  "vmov.i8    q15, #0x03                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
+  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
+                                                                               \
+  "vmov.i8    q15, #0x04                  \n"                                  \
+  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
+  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
+  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
+
+// Applies filter on 2 pixels (p0 and q0)
+#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
+  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
+  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
+  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
+  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
+  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
+  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
+  FLIP_SIGN_BIT2(p0, q0, q10)
+
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
+  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
+  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
+  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
+
+#define STORE8x2(c1, c2, p, stride)                                            \
+  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
+  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
+
+//-----------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
+    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
+    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
+    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
+
+    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
+    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", QRegs
+  );
+}
+
+static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
+  __asm__ volatile (
+    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
+    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
+    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
+
+    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
+    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
+    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
+    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
+    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+
+    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+
+    "sub        %[p], %[p], #1                 \n"  // p - 1
+
+    "vswp        d5, d6                        \n"
+    STORE8x2(d4, d5, [%[p]], %[stride])
+    STORE8x2(d6, d7, [%[p]], %[stride])
+
+    : [p] "+r"(p)
+    : [stride] "r"(stride), [thresh] "r"(thresh)
+    : "memory", "r4", "r5", "r6", QRegs
+  );
+}
+
+static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16NEON(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16NEON(p, stride, thresh);
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Inverse transforms (Paragraph 14.4)
+
+static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+  const int kBPS = BPS;
+  const int16_t constants[] = {20091, 17734, 0, 0};
+  /* kC1, kC2. Padded because vld1.16 loads 8 bytes
+   * Technically these are unsigned but vqdmulh is only available in signed.
+   * vqdmulh returns high half (effectively >> 16) but also doubles the value,
+   * changing the >> 16 to >> 15 and requiring an additional >> 1.
+   * We use this to our advantage with kC2. The canonical value is 35468.
+   * However, the high bit is set so treating it as signed will give incorrect
+   * results. We avoid this by down shifting by 1 here to clear the highest bit.
+   * Combined with the doubling effect of vqdmulh we get >> 16.
+   * This can not be applied to kC1 because the lowest bit is set. Down shifting
+   * the constant would reduce precision.
+   */
+
+  /* libwebp uses a trick to avoid some extra addition that libvpx does.
+   * Instead of:
+   * temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+   * libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
+   * same issue with kC1 and vqdmulh that we work around by down shifting kC2
+   */
+
+  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[constants]]        \n"
+
+    /* d2: in[0]
+     * d3: in[8]
+     * d4: in[4]
+     * d5: in[12]
+     */
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
+     * q9 = {in[4], in[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = in[0] + in[8]
+     * d23 = b = in[0] - in[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* The multiplication should be x * kC1 >> 16
+     * However, with vqdmulh we get x * kC1 * 2 >> 16
+     * (multiply, double, return high half)
+     * We avoided this in kC2 by pre-shifting the constant.
+     * q8 = in[4]/[12] * kC1 >> 16
+     */
+    "vshr.s16        q8, q8, #1                  \n"
+
+    /* Add {in[4], in[12]} back after the multiplication. This is handled by
+     * adding 1 << 16 to kC1 in the libwebp C code.
+     */
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = in[4]*kC2 - in[12]*kC1
+     * d21 = d = in[4]*kC1 + in[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
+     */
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    /* d22 = a = tmp[0] + tmp[8]
+     * d23 = b = tmp[0] - tmp[8]
+     */
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    /* See long winded explanations prior */
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    /* d20 = c = in[4]*kC2 - in[12]*kC1
+     * d21 = d = in[4]*kC1 + in[12]*kC2
+     */
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    /* d2 = tmp[0] = a + d
+     * d3 = tmp[1] = b + c
+     * d4 = tmp[2] = b - c
+     * d5 = tmp[3] = a - d
+     */
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
+
+    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+    /* (val) + 4 >> 3 */
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    /* Must accumulate before saturating */
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
+    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
+  );
+}
+
+static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOneNEON(in, dst);
+  if (do_two) {
+    TransformOneNEON(in + 16, dst + 4);
+  }
+}
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // The store is only incrementing the pointer as if we
+                         // had stored a single byte.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+
+    // set the results to output
+    "vst1.16         d0[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]   \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]   \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]   \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9"  // clobbered
+  );
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitNEON(void);
+
+void VP8DspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8Transform = TransformTwoNEON;
+  VP8TransformWHT = TransformWHT;
+
+  VP8SimpleVFilter16 = SimpleVFilter16NEON;
+  VP8SimpleHFilter16 = SimpleHFilter16NEON;
+  VP8SimpleVFilter16i = SimpleVFilter16iNEON;
+  VP8SimpleHFilter16i = SimpleHFilter16iNEON;
+#endif   // WEBP_USE_NEON
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/dec_sse2.c
+++ b/3rdparty/libwebp/dsp/dec_sse2.c
@@ -0,0 +1,908 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of some decoding functions (idct, loop filtering).
+//
+// Author: somnath@google.com (Somnath Banerjee)
+//         cduvivier@google.com (Christian Duvivier)
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+#include "../dec/vp8i.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
+  // This implementation makes use of 16-bit fixed point versions of two
+  // multiply constants:
+  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
+  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
+  //
+  // To be able to use signed 16-bit integers, we use the following trick to
+  // have constants within range:
+  // - Associated constants are obtained by subtracting the 16-bit fixed point
+  //   version of one:
+  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
+  //      K1 = 85267  =>  k1 =  20091
+  //      K2 = 35468  =>  k2 = -30068
+  // - The multiplication of a variable by a constant become the sum of the
+  //   variable and the multiplication of that variable by the associated
+  //   constant:
+  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
+  const __m128i k1 = _mm_set1_epi16(20091);
+  const __m128i k2 = _mm_set1_epi16(-30068);
+  __m128i T0, T1, T2, T3;
+
+  // Load and concatenate the transform coefficients (we'll do two transforms
+  // in parallel). In the case of only one transform, the second half of the
+  // vectors will just contain random value we'll never use nor store.
+  __m128i in0, in1, in2, in3;
+  {
+    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
+    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
+    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
+    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+    // a00 a10 a20 a30   x x x x
+    // a01 a11 a21 a31   x x x x
+    // a02 a12 a22 a32   x x x x
+    // a03 a13 a23 a33   x x x x
+    if (do_two) {
+      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
+      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
+      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
+      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+      in0 = _mm_unpacklo_epi64(in0, inB0);
+      in1 = _mm_unpacklo_epi64(in1, inB1);
+      in2 = _mm_unpacklo_epi64(in2, inB2);
+      in3 = _mm_unpacklo_epi64(in3, inB3);
+      // a00 a10 a20 a30   b00 b10 b20 b30
+      // a01 a11 a21 a31   b01 b11 b21 b31
+      // a02 a12 a22 a32   b02 b12 b22 b32
+      // a03 a13 a23 a33   b03 b13 b23 b33
+    }
+  }
+
+  // Vertical pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i a = _mm_add_epi16(in0, in2);
+    const __m128i b = _mm_sub_epi16(in0, in2);
+    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
+    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
+    const __m128i c3 = _mm_sub_epi16(in1, in3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
+    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
+    const __m128i d3 = _mm_add_epi16(in1, in3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Horizontal pass and subsequent transpose.
+  {
+    // First pass, c and d calculations are longer because of the "trick"
+    // multiplications.
+    const __m128i four = _mm_set1_epi16(4);
+    const __m128i dc = _mm_add_epi16(T0, four);
+    const __m128i a =  _mm_add_epi16(dc, T2);
+    const __m128i b =  _mm_sub_epi16(dc, T2);
+    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
+    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
+    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
+    const __m128i c3 = _mm_sub_epi16(T1, T3);
+    const __m128i c4 = _mm_sub_epi16(c1, c2);
+    const __m128i c = _mm_add_epi16(c3, c4);
+    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
+    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
+    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
+    const __m128i d3 = _mm_add_epi16(T1, T3);
+    const __m128i d4 = _mm_add_epi16(d1, d2);
+    const __m128i d = _mm_add_epi16(d3, d4);
+
+    // Second pass.
+    const __m128i tmp0 = _mm_add_epi16(a, d);
+    const __m128i tmp1 = _mm_add_epi16(b, c);
+    const __m128i tmp2 = _mm_sub_epi16(b, c);
+    const __m128i tmp3 = _mm_sub_epi16(a, d);
+    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
+    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
+    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
+    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
+
+    // Transpose the two 4x4.
+    // a00 a01 a02 a03   b00 b01 b02 b03
+    // a10 a11 a12 a13   b10 b11 b12 b13
+    // a20 a21 a22 a23   b20 b21 b22 b23
+    // a30 a31 a32 a33   b30 b31 b32 b33
+    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
+    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
+    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
+    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
+    // a00 a10 a01 a11   a02 a12 a03 a13
+    // a20 a30 a21 a31   a22 a32 a23 a33
+    // b00 b10 b01 b11   b02 b12 b03 b13
+    // b20 b30 b21 b31   b22 b32 b23 b33
+    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+    // a00 a10 a20 a30 a01 a11 a21 a31
+    // b00 b10 b20 b30 b01 b11 b21 b31
+    // a02 a12 a22 a32 a03 a13 a23 a33
+    // b02 b12 a22 b32 b03 b13 b23 b33
+    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+    // a00 a10 a20 a30   b00 b10 b20 b30
+    // a01 a11 a21 a31   b01 b11 b21 b31
+    // a02 a12 a22 a32   b02 b12 b22 b32
+    // a03 a13 a23 a33   b03 b13 b23 b33
+  }
+
+  // Add inverse transform to 'dst' and store.
+  {
+    const __m128i zero = _mm_setzero_si128();
+    // Load the reference(s).
+    __m128i dst0, dst1, dst2, dst3;
+    if (do_two) {
+      // Load eight bytes/pixels per line.
+      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
+      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
+      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
+      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+    } else {
+      // Load four bytes/pixels per line.
+      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
+      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
+      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
+      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+    }
+    // Convert to 16b.
+    dst0 = _mm_unpacklo_epi8(dst0, zero);
+    dst1 = _mm_unpacklo_epi8(dst1, zero);
+    dst2 = _mm_unpacklo_epi8(dst2, zero);
+    dst3 = _mm_unpacklo_epi8(dst3, zero);
+    // Add the inverse transform(s).
+    dst0 = _mm_add_epi16(dst0, T0);
+    dst1 = _mm_add_epi16(dst1, T1);
+    dst2 = _mm_add_epi16(dst2, T2);
+    dst3 = _mm_add_epi16(dst3, T3);
+    // Unsigned saturate to 8b.
+    dst0 = _mm_packus_epi16(dst0, dst0);
+    dst1 = _mm_packus_epi16(dst1, dst1);
+    dst2 = _mm_packus_epi16(dst2, dst2);
+    dst3 = _mm_packus_epi16(dst3, dst3);
+    // Store the results.
+    if (do_two) {
+      // Store eight bytes/pixels per line.
+      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
+      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
+      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
+      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+    } else {
+      // Store four bytes/pixels per line.
+      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
+      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
+      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
+      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Loop Filter (Paragraph 15)
+
+// Compute abs(p - q) = subs(p - q) OR subs(q - p)
+#define MM_ABS(p, q)  _mm_or_si128(                                            \
+    _mm_subs_epu8((q), (p)),                                                   \
+    _mm_subs_epu8((p), (q)))
+
+// Shift each byte of "a" by N bits while preserving by the sign bit.
+//
+// It first shifts the lower bytes of the words and then the upper bytes and
+// then merges the results together.
+#define SIGNED_SHIFT_N(a, N) {                                                 \
+  __m128i t = a;                                                               \
+  t = _mm_slli_epi16(t, 8);                                                    \
+  t = _mm_srai_epi16(t, N);                                                    \
+  t = _mm_srli_epi16(t, 8);                                                    \
+                                                                               \
+  a = _mm_srai_epi16(a, N + 8);                                                \
+  a = _mm_slli_epi16(a, 8);                                                    \
+                                                                               \
+  a = _mm_or_si128(t, a);                                                      \
+}
+
+#define FLIP_SIGN_BIT2(a, b) {                                                 \
+  a = _mm_xor_si128(a, sign_bit);                                              \
+  b = _mm_xor_si128(b, sign_bit);                                              \
+}
+
+#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
+  FLIP_SIGN_BIT2(a, b);                                                        \
+  FLIP_SIGN_BIT2(c, d);                                                        \
+}
+
+#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
+  const __m128i zero = _mm_setzero_si128();                                    \
+  const __m128i t_1 = MM_ABS(p1, p0);                                          \
+  const __m128i t_2 = MM_ABS(q1, q0);                                          \
+                                                                               \
+  const __m128i h = _mm_set1_epi8(hev_thresh);                                 \
+  const __m128i t_3 = _mm_subs_epu8(t_1, h);  /* abs(p1 - p0) - hev_tresh */   \
+  const __m128i t_4 = _mm_subs_epu8(t_2, h);  /* abs(q1 - q0) - hev_tresh */   \
+                                                                               \
+  not_hev = _mm_or_si128(t_3, t_4);                                            \
+  not_hev = _mm_cmpeq_epi8(not_hev, zero); /* not_hev <= t1 && not_hev <= t2 */\
+}
+
+#define GET_BASE_DELTA(p1, p0, q0, q1, o) {                                    \
+  const __m128i qp0 = _mm_subs_epi8(q0, p0);  /* q0 - p0 */                    \
+  o = _mm_subs_epi8(p1, q1);            /* p1 - q1 */                          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 1 * (q0 - p0) */          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 2 * (q0 - p0) */          \
+  o = _mm_adds_epi8(o, qp0);            /* p1 - q1 + 3 * (q0 - p0) */          \
+}
+
+#define DO_SIMPLE_FILTER(p0, q0, fl) {                                         \
+  const __m128i three = _mm_set1_epi8(3);                                      \
+  const __m128i four = _mm_set1_epi8(4);                                       \
+  __m128i v3 = _mm_adds_epi8(fl, three);                                       \
+  __m128i v4 = _mm_adds_epi8(fl, four);                                        \
+                                                                               \
+  /* Do +4 side */                                                             \
+  SIGNED_SHIFT_N(v4, 3);                /* v4 >> 3  */                         \
+  q0 = _mm_subs_epi8(q0, v4);           /* q0 -= v4 */                         \
+                                                                               \
+  /* Now do +3 side */                                                         \
+  SIGNED_SHIFT_N(v3, 3);                /* v3 >> 3  */                         \
+  p0 = _mm_adds_epi8(p0, v3);           /* p0 += v3 */                         \
+}
+
+// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations:
+// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
+#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
+  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
+  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
+  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
+  pi = _mm_adds_epi8(pi, delta);                                               \
+  qi = _mm_subs_epi8(qi, delta);                                               \
+}
+
+static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
+                        const __m128i* q1, int thresh, __m128i *mask) {
+  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
+  *mask = _mm_set1_epi8(0xFE);
+  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
+  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2
+
+  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
+  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
+  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+  t1 = _mm_set1_epi8(thresh);
+  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
+  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
+}
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// Applies filter on 2 pixels (p0 and q0)
+static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
+                                  const __m128i* q1, int thresh) {
+  __m128i a, mask;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
+  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
+
+  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+
+  // convert to signed values
+  FLIP_SIGN_BIT2(*p0, *q0);
+
+  GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
+  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
+  DO_SIMPLE_FILTER(*p0, *q0, a);
+
+  // unoffset
+  FLIP_SIGN_BIT2(*p0, *q0);
+}
+
+// Applies filter on 4 pixels (p1, p0, q0 and q1)
+static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1,
+                                  const __m128i* mask, int hev_thresh) {
+  __m128i not_hev;
+  __m128i t1, t2, t3;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+
+  // compute hev mask
+  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+
+  // convert to signed values
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+
+  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
+  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
+  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
+  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
+  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about
+
+  // Do +4 side
+  t2 = _mm_set1_epi8(4);
+  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
+  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  t3 = t2;                           // save t2
+  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2
+
+  // Now do +3 side
+  t2 = _mm_set1_epi8(3);
+  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
+  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
+
+  t2 = _mm_set1_epi8(1);
+  t3 = _mm_adds_epi8(t3, t2);
+  SIGNED_SHIFT_N(t3, 1);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
+
+  t3 = _mm_and_si128(not_hev, t3);   // if !hev
+  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
+  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
+
+  // unoffset
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+}
+
+// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
+static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
+                                  __m128i* q0, __m128i* q1, __m128i *q2,
+                                  const __m128i* mask, int hev_thresh) {
+  __m128i a, not_hev;
+  const __m128i sign_bit = _mm_set1_epi8(0x80);
+
+  // compute hev mask
+  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
+
+  // convert to signed values
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+
+  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
+
+  { // do simple filter on pixels with hev
+    const __m128i m = _mm_andnot_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+    DO_SIMPLE_FILTER(*p0, *q0, f);
+  }
+  { // do strong filter on pixels with not hev
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i nine = _mm_set1_epi16(0x0900);
+    const __m128i sixty_three = _mm_set1_epi16(63);
+
+    const __m128i m = _mm_and_si128(not_hev, *mask);
+    const __m128i f = _mm_and_si128(a, m);
+    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
+    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);
+
+    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
+    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
+    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
+    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18
+
+    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
+    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63
+
+    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
+    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63
+
+    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
+    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63
+
+    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
+    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
+    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
+  }
+
+  // unoffset
+  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
+  FLIP_SIGN_BIT2(*p2, *q2);
+}
+
+// reads 8 rows across a vertical edge.
+//
+// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
+// two Load4x4() to avoid code duplication.
+static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
+                                __m128i* p, __m128i* q) {
+  __m128i t1, t2;
+
+  // Load 0th, 1st, 4th and 5th rows
+  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
+  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
+  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
+  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10
+
+  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+  t1 = _mm_unpacklo_epi8(r0, r1);
+
+  // Load 2nd, 3rd, 6th and 7th rows
+  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 22
+  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
+  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
+  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70
+
+  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
+  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30
+
+  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+  t2 = _mm_unpacklo_epi8(r0, r1);
+
+  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+  r0 = t1;
+  t1 = _mm_unpacklo_epi16(t1, t2);
+  t2 = _mm_unpackhi_epi16(r0, t2);
+
+  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  *p = _mm_unpacklo_epi32(t1, t2);
+  *q = _mm_unpackhi_epi32(t1, t2);
+}
+
+static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
+                                 int stride,
+                                 __m128i* p1, __m128i* p0,
+                                 __m128i* q0, __m128i* q1) {
+  __m128i t1, t2;
+  // Assume the pixels around the edge (|) are numbered as follows
+  //                00 01 | 02 03
+  //                10 11 | 12 13
+  //                 ...  |  ...
+  //                e0 e1 | e2 e3
+  //                f0 f1 | f2 f3
+  //
+  // r0 is pointing to the 0th row (00)
+  // r8 is pointing to the 8th row (80)
+
+  // Load
+  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
+  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
+  Load8x4(r0, stride, p1, q0);
+  Load8x4(r8, stride, p0, q1);
+
+  t1 = *p1;
+  t2 = *q0;
+  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+  *p1 = _mm_unpacklo_epi64(t1, *p0);
+  *p0 = _mm_unpackhi_epi64(t1, *p0);
+  *q0 = _mm_unpacklo_epi64(t2, *q1);
+  *q1 = _mm_unpackhi_epi64(t2, *q1);
+}
+
+static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
+  int i;
+  for (i = 0; i < 4; ++i, dst += stride) {
+    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+    *x = _mm_srli_si128(*x, 4);
+  }
+}
+
+// Transpose back and store
+static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
+                                  __m128i* p1, __m128i* p0,
+                                  __m128i* q0, __m128i* q1) {
+  __m128i t1;
+
+  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi8(*p1, t1);
+  *p1 = _mm_unpackhi_epi8(*p1, t1);
+
+  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+  t1 = *q0;
+  *q0 = _mm_unpacklo_epi8(t1, *q1);
+  *q1 = _mm_unpackhi_epi8(t1, *q1);
+
+  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+  t1 = *p0;
+  *p0 = _mm_unpacklo_epi16(t1, *q0);
+  *q0 = _mm_unpackhi_epi16(t1, *q0);
+
+  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
+  t1 = *p1;
+  *p1 = _mm_unpacklo_epi16(t1, *q1);
+  *q1 = _mm_unpackhi_epi16(t1, *q1);
+
+  Store4x4(p0, r0, stride);
+  r0 += 4 * stride;
+  Store4x4(q0, r0, stride);
+
+  Store4x4(p1, r8, stride);
+  r8 += 4 * stride;
+  Store4x4(q1, r8, stride);
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
+  // Load
+  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
+  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
+  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
+  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
+
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-stride], p0);
+  _mm_storeu_si128((__m128i*)p, q0);
+}
+
+static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
+  __m128i p1, p0, q0, q1;
+
+  p -= 2;  // beginning of p1
+
+  Load16x4(p, p + 8 * stride,  stride, &p1, &p0, &q0, &q1);
+  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+}
+
+static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16SSE2(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16SSE2(p, stride, thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+#define MAX_DIFF1(p3, p2, p1, p0, m) {                                         \
+  m = MM_ABS(p3, p2);                                                          \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+}
+
+#define MAX_DIFF2(p3, p2, p1, p0, m) {                                         \
+  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
+}
+
+#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
+  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
+  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
+  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
+  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+}
+
+#define LOADUV_H_EDGE(p, u, v, stride) {                                       \
+  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
+  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
+}
+
+#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
+  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
+  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
+  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
+  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+}
+
+#define STOREUV(p, u, v, stride) {                                             \
+  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
+  p = _mm_srli_si128(p, 8);                                                    \
+  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+}
+
+#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) {               \
+  __m128i fl_yes;                                                              \
+  const __m128i it = _mm_set1_epi8(ithresh);                                   \
+  mask = _mm_subs_epu8(mask, it);                                              \
+  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());                            \
+  NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes);                            \
+  mask = _mm_and_si128(mask, fl_yes);                                          \
+}
+
+// on macroblock edges
+static void VFilter16SSE2(uint8_t* p, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i t1;
+  __m128i mask;
+  __m128i p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
+  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+  _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+  _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
+}
+
+static void HFilter16SSE2(uint8_t* p, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  uint8_t* const b = p - 4;
+  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
+  Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
+}
+
+// on three inner edges
+static void VFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    // Load p3, p2, p1, p0
+    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    p += 4 * stride;
+
+    // Load q0, q1, q2, q3
+    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    // Store
+    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
+    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
+    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
+    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
+  }
+}
+
+static void HFilter16iSSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
+  int k;
+  uint8_t* b;
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  for (k = 3; k > 0; --k) {
+    b = p;
+    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0);  // p3, p2, p1, p0
+    MAX_DIFF1(t2, t1, p1, p0, mask);
+
+    b += 4;  // beginning of q0
+    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+    MAX_DIFF2(t2, t1, q1, q0, mask);
+
+    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+    b -= 2;  // beginning of p1
+    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);
+
+    p += 4;
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, p2, p1, p0, q0, q1, q2;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
+  MAX_DIFF1(t1, p2, p1, p0, mask);
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
+  MAX_DIFF2(t1, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p2, u, v, -3 * stride);
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+  STOREUV(q2, u, v, 2 * stride);
+}
+
+static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  uint8_t* const tu = u - 4;
+  uint8_t* const tv = v - 4;
+  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  MAX_DIFF1(p3, p2, p1, p0, mask);
+
+  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  MAX_DIFF2(q3, q2, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+
+  Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
+  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
+}
+
+static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+
+  // Load p3, p2, p1, p0
+  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4 * stride;
+  v += 4 * stride;
+
+  // Load q0, q1, q2, q3
+  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  // Store
+  STOREUV(p1, u, v, -2 * stride);
+  STOREUV(p0, u, v, -1 * stride);
+  STOREUV(q0, u, v, 0 * stride);
+  STOREUV(q1, u, v, 1 * stride);
+}
+
+static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
+  __m128i mask;
+  __m128i t1, t2, p1, p0, q0, q1;
+  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  MAX_DIFF1(t2, t1, p1, p0, mask);
+
+  u += 4;  // beginning of q0
+  v += 4;
+  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  MAX_DIFF2(t2, t1, q1, q0, mask);
+
+  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
+  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+
+  u -= 2;  // beginning of p1
+  v -= 2;
+  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
+}
+
+#endif   // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE2(void);
+
+void VP8DspInitSSE2(void) {
+#if defined(WEBP_USE_SSE2)
+  VP8Transform = TransformSSE2;
+
+  VP8VFilter16 = VFilter16SSE2;
+  VP8HFilter16 = HFilter16SSE2;
+  VP8VFilter8 = VFilter8SSE2;
+  VP8HFilter8 = HFilter8SSE2;
+  VP8VFilter16i = VFilter16iSSE2;
+  VP8HFilter16i = HFilter16iSSE2;
+  VP8VFilter8i = VFilter8iSSE2;
+  VP8HFilter8i = HFilter8iSSE2;
+
+  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
+#endif   // WEBP_USE_SSE2
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/dsp.h
+++ b/3rdparty/libwebp/dsp/dsp.h
@@ -0,0 +1,213 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+//   Speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_DSP_H_
+#define WEBP_DSP_DSP_H_
+
+#include "../webp/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// CPU detection
+
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2)
+#define WEBP_USE_SSE2
+#endif
+
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
+#define WEBP_ANDROID_NEON  // Android targets that might support NEON
+#endif
+
+#if defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON)
+#define WEBP_USE_NEON
+#endif
+
+typedef enum {
+  kSSE2,
+  kSSE3,
+  kNEON
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+extern VP8CPUInfo VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Encoding
+
+// Transforms
+// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
+//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
+typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                        int do_two);
+typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
+typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
+extern VP8Idct VP8ITransform;
+extern VP8Fdct VP8FTransform;
+extern VP8WHT VP8ITransformWHT;
+extern VP8WHT VP8FTransformWHT;
+// Predictions
+// *dst is the destination block. *top and *left can be NULL.
+typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left,
+                              const uint8_t* top);
+typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top);
+extern VP8Intra4Preds VP8EncPredLuma4;
+extern VP8IntraPreds VP8EncPredLuma16;
+extern VP8IntraPreds VP8EncPredChroma8;
+
+typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
+extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
+typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
+                          const uint16_t* const weights);
+extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
+
+typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
+extern VP8BlockCopy VP8Copy4x4;
+// Quantization
+struct VP8Matrix;   // forward declaration
+typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
+                                int n, const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlock VP8EncQuantizeBlock;
+
+// Collect histogram for susceptibility calculation and accumulate in histo[].
+struct VP8Histogram;
+typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
+                          int start_block, int end_block,
+                          struct VP8Histogram* const histo);
+extern const int VP8DspScan[16 + 4 + 4];
+extern VP8CHisto VP8CollectHistogram;
+
+void VP8EncDspInit(void);   // must be called before using any of the above
+
+//------------------------------------------------------------------------------
+// Decoding
+
+typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+// when doing two transforms, coeffs is actually int16_t[2][16].
+typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformUV;
+extern VP8DecIdct VP8TransformDC;
+extern VP8DecIdct VP8TransformDCUV;
+extern VP8WHT VP8TransformWHT;
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t* dst);
+extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outer edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+// must be called before anything using the above
+void VP8DspInit(void);
+
+//------------------------------------------------------------------------------
+// WebP I/O
+
+#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+typedef void (*WebPUpsampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* cur_u, const uint8_t* cur_v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB(A) modes
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+// Initializes SSE2 version of the fancy upsamplers.
+void WebPInitUpsamplersSSE2(void);
+
+// NEON version
+void WebPInitUpsamplersNEON(void);
+
+#endif    // FANCY_UPSAMPLING
+
+// Point-sampling methods.
+typedef void (*WebPSampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* u, const uint8_t* v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */];
+
+// General function for converting two lines of ARGB or RGBA.
+// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
+// as 0x00, 0x00, 0x00, 0xff (little endian).
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
+
+// YUV444->RGB converters
+typedef void (*WebPYUV444Converter)(const uint8_t* y,
+                                    const uint8_t* u, const uint8_t* v,
+                                    uint8_t* dst, int len);
+
+extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+
+// Main function to be called
+void WebPInitUpsamplers(void);
+
+//------------------------------------------------------------------------------
+// Pre-multiply planes with alpha values
+
+// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
+// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
+extern void (*WebPApplyAlphaMultiply)(
+    uint8_t* rgba, int alpha_first, int w, int h, int stride);
+
+// Same, buf specifically for RGBA4444 format
+extern void (*WebPApplyAlphaMultiply4444)(
+    uint8_t* rgba4444, int w, int h, int stride);
+
+// To be called first before using the above.
+void WebPInitPremultiply(void);
+
+void WebPInitPremultiplySSE2(void);   // should not be called directly.
+void WebPInitPremultiplyNEON(void);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_DSP_H_ */
--- a/3rdparty/libwebp/dsp/enc.c
+++ b/3rdparty/libwebp/dsp/enc.c
@@ -0,0 +1,728 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// Speed-critical encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>  // for abs()
+#include "./dsp.h"
+#include "../enc/vp8enci.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int clip_max(int v, int max) {
+  return (v > max) ? max : v;
+}
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
+
+const int VP8DspScan[16 + 4 + 4] = {
+  // Luma
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+
+  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
+  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
+};
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+                             int start_block, int end_block,
+                             VP8Histogram* const histo) {
+  int j;
+  for (j = start_block; j < end_block; ++j) {
+    int k;
+    int16_t out[16];
+
+    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to bin.
+    for (k = 0; k < 16; ++k) {
+      const int v = abs(out[k]) >> 3;  // TODO(skal): add rounding?
+      const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
+      histo->distribution[clipped_value]++;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// run-time tables (~4k)
+
+static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+static void InitTables(void) {
+  if (!tables_ok) {
+    int i;
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = clip_8b(i);
+    }
+    tables_ok = 1;
+  }
+}
+
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#define STORE(x, y, v) \
+  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+                                      uint8_t* dst) {
+  int C[4 * 4], *tmp;
+  int i;
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // vertical pass
+    const int a = in[0] + in[8];
+    const int b = in[0] - in[8];
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
+    tmp[0] = a + d;
+    tmp[1] = b + c;
+    tmp[2] = b - c;
+    tmp[3] = a - d;
+    tmp += 4;
+    in++;
+  }
+
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // horizontal pass
+    const int dc = tmp[0] + 4;
+    const int a =  dc +  tmp[8];
+    const int b =  dc -  tmp[8];
+    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    STORE(0, i, a + d);
+    STORE(1, i, b + c);
+    STORE(2, i, b - c);
+    STORE(3, i, a - d);
+    tmp++;
+  }
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                       int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+  int i;
+  int tmp[16];
+  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
+    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
+    const int d1 = src[1] - ref[1];
+    const int d2 = src[2] - ref[2];
+    const int d3 = src[3] - ref[3];
+    const int a0 = (d0 + d3);         // 10b                      [-510,510]
+    const int a1 = (d1 + d2);
+    const int a2 = (d1 - d2);
+    const int a3 = (d0 - d3);
+    tmp[0 + i * 4] = (a0 + a1) << 3;  // 14b                      [-8160,8160]
+    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;      // [-7536,7542]
+    tmp[2 + i * 4] = (a0 - a1) << 3;
+    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[12 + i]);  // 15b
+    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
+    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
+    const int a3 = (tmp[0 + i] - tmp[12 + i]);
+    out[0 + i] = (a0 + a1 + 7) >> 4;            // 12b
+    out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+    out[8 + i] = (a0 - a1 + 7) >> 4;
+    out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
+  }
+}
+
+static void ITransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    const int a0 = in[0 + i] + in[12 + i];
+    const int a1 = in[4 + i] + in[ 8 + i];
+    const int a2 = in[4 + i] - in[ 8 + i];
+    const int a3 = in[0 + i] - in[12 + i];
+    tmp[0  + i] = a0 + a1;
+    tmp[8  + i] = a0 - a1;
+    tmp[4  + i] = a3 + a2;
+    tmp[12 + i] = a3 - a2;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
+    const int a0 = dc             + tmp[3 + i * 4];
+    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
+    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
+    const int a3 = dc             - tmp[3 + i * 4];
+    out[ 0] = (a0 + a1) >> 3;
+    out[16] = (a3 + a2) >> 3;
+    out[32] = (a0 - a1) >> 3;
+    out[48] = (a3 - a2) >> 3;
+    out += 64;
+  }
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i, in += 64) {
+    const int a0 = (in[0 * 16] + in[2 * 16]) << 2;
+    const int a1 = (in[1 * 16] + in[3 * 16]) << 2;
+    const int a2 = (in[1 * 16] - in[3 * 16]) << 2;
+    const int a3 = (in[0 * 16] - in[2 * 16]) << 2;
+    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int a0 = (tmp[0 + i] + tmp[8 + i]);
+    const int a1 = (tmp[4 + i] + tmp[12+ i]);
+    const int a2 = (tmp[4 + i] - tmp[12+ i]);
+    const int a3 = (tmp[0 + i] - tmp[8 + i]);
+    const int b0 = a0 + a1;
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
+    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
+    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
+    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
+  }
+}
+
+#undef MUL
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+  int j;
+  for (j = 0; j < size; ++j) {
+    memset(dst + j * BPS, value, size);
+  }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+                                     const uint8_t* top, int size) {
+  int j;
+  if (top) {
+    for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
+  } else {
+    Fill(dst, 127, size);
+  }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+                                       const uint8_t* left, int size) {
+  if (left) {
+    int j;
+    for (j = 0; j < size; ++j) {
+      memset(dst + j * BPS, left[j], size);
+    }
+  } else {
+    Fill(dst, 129, size);
+  }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top, int size) {
+  int y;
+  if (left) {
+    if (top) {
+      const uint8_t* const clip = clip1 + 255 - left[-1];
+      for (y = 0; y < size; ++y) {
+        const uint8_t* const clip_table = clip + left[y];
+        int x;
+        for (x = 0; x < size; ++x) {
+          dst[x] = clip_table[top[x]];
+        }
+        dst += BPS;
+      }
+    } else {
+      HorizontalPred(dst, left, size);
+    }
+  } else {
+    // true motion without left samples (hence: with default 129 value)
+    // is equivalent to VE prediction where you just copy the top samples.
+    // Note that if top samples are not available, the default value is
+    // then 129, and not 127 as in the VerticalPred case.
+    if (top) {
+      VerticalPred(dst, top, size);
+    } else {
+      Fill(dst, 129, size);
+    }
+  }
+}
+
+static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top,
+                               int size, int round, int shift) {
+  int DC = 0;
+  int j;
+  if (top) {
+    for (j = 0; j < size; ++j) DC += top[j];
+    if (left) {   // top and left present
+      for (j = 0; j < size; ++j) DC += left[j];
+    } else {      // top, but no left
+      DC += DC;
+    }
+    DC = (DC + round) >> shift;
+  } else if (left) {   // left but no top
+    for (j = 0; j < size; ++j) DC += left[j];
+    DC += DC;
+    DC = (DC + round) >> shift;
+  } else {   // no top, no left, nothing.
+    DC = 0x80;
+  }
+  Fill(dst, DC, size);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top) {
+  // U block
+  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
+  // V block
+  dst += 8;
+  if (top) top += 8;
+  if (left) left += 16;
+  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
+  VerticalPred(C8VE8 + dst, top, 8);
+  HorizontalPred(C8HE8 + dst, left, 8);
+  TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds(uint8_t* dst,
+                         const uint8_t* left, const uint8_t* top) {
+  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
+  VerticalPred(I16VE16 + dst, top, 16);
+  HorizontalPred(I16HE16 + dst, left, 16);
+  TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const uint8_t vals[4] = {
+    AVG3(top[-1], top[0], top[1]),
+    AVG3(top[ 0], top[1], top[2]),
+    AVG3(top[ 1], top[2], top[3]),
+    AVG3(top[ 2], top[3], top[4])
+  };
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, 4);
+  }
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
+  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
+  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
+  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
+}
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+  Fill(dst, dc >> 3, 4);
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(0, 2) = DST(1, 3)                         = AVG3(I, J, K);
+  DST(0, 1) = DST(1, 2) = DST(2, 3)             = AVG3(X, I, J);
+  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
+  DST(1, 0) = DST(2, 1) = DST(3, 2)             = AVG3(B, A, X);
+  DST(2, 0) = DST(3, 1)                         = AVG3(C, B, A);
+  DST(3, 0)                                     = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+  DST(3, 1) = DST(2, 2) = DST(1, 3)             = AVG3(E, F, G);
+  DST(3, 2) = DST(2, 3)                         = AVG3(F, G, H);
+  DST(3, 3)                                     = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+  const int D = top[3];
+  const int E = top[4];
+  const int F = top[5];
+  const int G = top[6];
+  const int H = top[7];
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+  const int X = top[-1];
+  const int I = top[-2];
+  const int J = top[-3];
+  const int K = top[-4];
+  const int L = top[-5];
+  const int A = top[0];
+  const int B = top[1];
+  const int C = top[2];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+  int x, y;
+  const uint8_t* const clip = clip1 + 255 - top[-1];
+  for (y = 0; y < 4; ++y) {
+    const uint8_t* const clip_table = clip + top[-2 - y];
+    for (x = 0; x < 4; ++x) {
+      dst[x] = clip_table[top[x]];
+    }
+    dst += BPS;
+  }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+// Left samples are top[-5 .. -2], top_left is top[-1], top are
+// located at top[0..3], and top right is top[4..7]
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+  DC4(I4DC4 + dst, top);
+  TM4(I4TM4 + dst, top);
+  VE4(I4VE4 + dst, top);
+  HE4(I4HE4 + dst, top);
+  RD4(I4RD4 + dst, top);
+  VR4(I4VR4 + dst, top);
+  LD4(I4LD4 + dst, top);
+  VL4(I4VL4 + dst, top);
+  HD4(I4HD4 + dst, top);
+  HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
+                              int w, int h) {
+  int count = 0;
+  int y, x;
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const int diff = (int)a[x] - b[x];
+      count += diff * diff;
+    }
+    a += BPS;
+    b += BPS;
+  }
+  return count;
+}
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 16);
+}
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 16, 8);
+}
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 8, 8);
+}
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+  return GetSSE(a, b, 4, 4);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+  int sum = 0;
+  int tmp[16];
+  int i;
+  // horizontal pass
+  for (i = 0; i < 4; ++i, in += BPS) {
+    const int a0 = in[0] + in[2];
+    const int a1 = in[1] + in[3];
+    const int a2 = in[1] - in[3];
+    const int a3 = in[0] - in[2];
+    tmp[0 + i * 4] = a0 + a1;
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+  // vertical pass
+  for (i = 0; i < 4; ++i, ++w) {
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12+ i];
+    const int a2 = tmp[4 + i] - tmp[12+ i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
+    const int b0 = a0 + a1;
+    const int b1 = a3 + a2;
+    const int b2 = a3 - a2;
+    const int b3 = a0 - a1;
+
+    sum += w[ 0] * abs(b0);
+    sum += w[ 4] * abs(b1);
+    sum += w[ 8] * abs(b2);
+    sum += w[12] * abs(b3);
+  }
+  return sum;
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int sum1 = TTransform(a, w);
+  const int sum2 = TTransform(b, w);
+  return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+// Simple quantization
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+                         int n, const VP8Matrix* const mtx) {
+  int last = -1;
+  for (; n < 16; ++n) {
+    const int j = kZigzag[n];
+    const int sign = (in[j] < 0);
+    const int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    if (coeff > mtx->zthresh_[j]) {
+      const int Q = mtx->q_[j];
+      const int iQ = mtx->iq_[j];
+      const int B = mtx->bias_[j];
+      out[n] = QUANTDIV(coeff, iQ, B);
+      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
+      if (sign) out[n] = -out[n];
+      in[j] = out[n] * Q;
+      if (out[n]) last = n;
+    } else {
+      out[n] = 0;
+      in[j] = 0;
+    }
+  }
+  return (last >= 0);
+}
+
+//------------------------------------------------------------------------------
+// Block copy
+
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+  int y;
+  for (y = 0; y < size; ++y) {
+    memcpy(dst, src, size);
+    src += BPS;
+    dst += BPS;
+  }
+}
+
+static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+
+//------------------------------------------------------------------------------
+// Initialization
+
+// Speed-critical function pointers. We have to initialize them to the default
+// implementations within VP8EncDspInit().
+VP8CHisto VP8CollectHistogram;
+VP8Idct VP8ITransform;
+VP8Fdct VP8FTransform;
+VP8WHT VP8ITransformWHT;
+VP8WHT VP8FTransformWHT;
+VP8Intra4Preds VP8EncPredLuma4;
+VP8IntraPreds VP8EncPredLuma16;
+VP8IntraPreds VP8EncPredChroma8;
+VP8Metric VP8SSE16x16;
+VP8Metric VP8SSE8x8;
+VP8Metric VP8SSE16x8;
+VP8Metric VP8SSE4x4;
+VP8WMetric VP8TDisto4x4;
+VP8WMetric VP8TDisto16x16;
+VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8BlockCopy VP8Copy4x4;
+
+extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitNEON(void);
+
+void VP8EncDspInit(void) {
+  InitTables();
+
+  // default C implementations
+  VP8CollectHistogram = CollectHistogram;
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+  VP8ITransformWHT = ITransformWHT;
+  VP8FTransformWHT = FTransformWHT;
+  VP8EncPredLuma4 = Intra4Preds;
+  VP8EncPredLuma16 = Intra16Preds;
+  VP8EncPredChroma8 = IntraChromaPreds;
+  VP8SSE16x16 = SSE16x16;
+  VP8SSE8x8 = SSE8x8;
+  VP8SSE16x8 = SSE16x8;
+  VP8SSE4x4 = SSE4x4;
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+  VP8EncQuantizeBlock = QuantizeBlock;
+  VP8Copy4x4 = Copy4x4;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspInitSSE2();
+    }
+#elif defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8EncDspInitNEON();
+    }
+#endif
+  }
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/3rdparty/libwebp/dsp/enc_neon.c
+++ b/3rdparty/libwebp/dsp/enc_neon.c
@@ -0,0 +1,661 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// This code is licensed under the same terms as WebM:
+//  Software License Agreement:  http://www.webmproject.org/license/software/
+//  Additional IP Rights Grant:  http://www.webmproject.org/license/additional/
+// -----------------------------------------------------------------------------
+//
+// ARM NEON version of speed-critical encoding functions.
+//
+// adapted from libvpx (http://www.webmproject.org/code/)
+
+#include "./dsp.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_NEON)
+
+#include "../enc/vp8enci.h"
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+// Inverse transform.
+// This code is pretty much the same as TransformOneNEON in the decoder, except
+// for subtraction to *ref. See the comments there for algorithmic explanations.
+static void ITransformOne(const uint8_t* ref,
+                          const int16_t* in, uint8_t* dst) {
+  const int kBPS = BPS;
+  const int16_t kC1C2[] = { 20091, 17734, 0, 0 };  // kC1 / (kC2 >> 1) / 0 / 0
+
+  __asm__ volatile (
+    "vld1.16         {q1, q2}, [%[in]]           \n"
+    "vld1.16         {d0}, [%[kC1C2]]            \n"
+
+    // d2: in[0]
+    // d3: in[8]
+    // d4: in[4]
+    // d5: in[12]
+    "vswp            d3, d4                      \n"
+
+    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
+    // q9 = {in[4], in[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = in[0] + in[8]
+    // d23 = b = in[0] - in[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    //  q8 = in[4]/[12] * kC1 >> 16
+    "vshr.s16        q8, q8, #1                  \n"
+
+    // Add {in[4], in[12]} back after the multiplication.
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    "vswp            d3, d4                      \n"
+
+    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
+    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
+    "vqdmulh.s16     q8, q2, d0[0]               \n"
+    "vqdmulh.s16     q9, q2, d0[1]               \n"
+
+    // d22 = a = tmp[0] + tmp[8]
+    // d23 = b = tmp[0] - tmp[8]
+    "vqadd.s16       d22, d2, d3                 \n"
+    "vqsub.s16       d23, d2, d3                 \n"
+
+    "vshr.s16        q8, q8, #1                  \n"
+    "vqadd.s16       q8, q2, q8                  \n"
+
+    // d20 = c = in[4]*kC2 - in[12]*kC1
+    // d21 = d = in[4]*kC1 + in[12]*kC2
+    "vqsub.s16       d20, d18, d17               \n"
+    "vqadd.s16       d21, d19, d16               \n"
+
+    // d2 = tmp[0] = a + d
+    // d3 = tmp[1] = b + c
+    // d4 = tmp[2] = b - c
+    // d5 = tmp[3] = a - d
+    "vqadd.s16       d2, d22, d21                \n"
+    "vqadd.s16       d3, d23, d20                \n"
+    "vqsub.s16       d4, d23, d20                \n"
+    "vqsub.s16       d5, d22, d21                \n"
+
+    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
+    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
+
+    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
+
+    // (val) + 4 >> 3
+    "vrshr.s16       d2, d2, #3                  \n"
+    "vrshr.s16       d3, d3, #3                  \n"
+    "vrshr.s16       d4, d4, #3                  \n"
+    "vrshr.s16       d5, d5, #3                  \n"
+
+    "vzip.16         q1, q2                      \n"
+    "vzip.16         q1, q2                      \n"
+
+    // Must accumulate before saturating
+    "vmovl.u8        q8, d6                      \n"
+    "vmovl.u8        q9, d7                      \n"
+
+    "vqadd.s16       q1, q1, q8                  \n"
+    "vqadd.s16       q2, q2, q9                  \n"
+
+    "vqmovun.s16     d0, q1                      \n"
+    "vqmovun.s16     d1, q2                      \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+
+    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
+    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
+    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
+  );
+}
+
+static void ITransform(const uint8_t* ref,
+                       const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne(ref, in, dst);
+  if (do_two) {
+    ITransformOne(ref + 4, in + 16, dst + 4);
+  }
+}
+
+// Same code as dec_neon.c
+static void ITransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;  // The store is only incrementing the pointer as if we
+                         // had stored a single byte.
+  __asm__ volatile (
+    // part 1
+    // load data into q0, q1
+    "vld1.16         {q0, q1}, [%[in]]           \n"
+
+    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
+    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
+    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+
+    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
+    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
+    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+
+    // Transpose
+    // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
+    // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
+    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vtrn.32         q0, q1                      \n"
+    "vtrn.32         q2, q3                      \n"
+
+    "vmov.s32        q4, #3                      \n" // dc = 3
+    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
+    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
+    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+
+    "vadd.s32        q0, q6, q7                  \n"
+    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q1, q9, q8                  \n"
+    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
+    "vsub.s32        q2, q6, q7                  \n"
+    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vsub.s32        q3, q9, q8                  \n"
+    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+
+    // set the results to output
+    "vst1.16         d0[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[0], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[1], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[2], [%[out]], %[kStep]      \n"
+    "vst1.16         d0[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d1[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d2[3], [%[out]], %[kStep]      \n"
+    "vst1.16         d3[3], [%[out]], %[kStep]      \n"
+
+    : [out] "+r"(out)  // modified registers
+    : [in] "r"(in), [kStep] "r"(kStep)  // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4",
+      "q5", "q6", "q7", "q8", "q9" // clobbered
+  );
+}
+
+// Forward transform.
+
+// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
+static const int16_t kCoeff16[] = {
+  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
+};
+static const int32_t kCoeff32[] = {
+   1812,  1812,  1812,  1812,
+    937,   937,   937,   937,
+  12000, 12000, 12000, 12000,
+  51000, 51000, 51000, 51000
+};
+
+static void FTransform(const uint8_t* src, const uint8_t* ref,
+                       int16_t* out) {
+  const int kBPS = BPS;
+  const uint8_t* src_ptr = src;
+  const uint8_t* ref_ptr = ref;
+  const int16_t* coeff16 = kCoeff16;
+  const int32_t* coeff32 = kCoeff32;
+
+  __asm__ volatile (
+    // load src into q4, q5 in high half
+    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
+    "vld1.8 {d11}, [%[src_ptr]]               \n"
+
+    // load ref into q6, q7 in high half
+    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
+    "vld1.8 {d15}, [%[ref_ptr]]               \n"
+
+    // Pack the high values in to q4 and q6
+    "vtrn.32     q4, q5                       \n"
+    "vtrn.32     q6, q7                       \n"
+
+    // d[0-3] = src - ref
+    "vsubl.u8    q0, d8, d12                  \n"
+    "vsubl.u8    q1, d9, d13                  \n"
+
+    // load coeff16 into q8(d16=5352, d17=2217)
+    "vld1.16     {q8}, [%[coeff16]]           \n"
+
+    // load coeff32 high half into q9 = 1812, q10 = 937
+    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
+
+    // load coeff32 low half into q11=12000, q12=51000
+    "vld1.32     {q11,q12}, [%[coeff32]]      \n"
+
+    // part 1
+    // Transpose. Register dN is the same as dN in C
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
+    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
+    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
+    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
+
+    "vadd.s16        d0, d4, d5               \n" // a0 + a1
+    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
+    "vsub.s16        d2, d4, d5               \n" // a0 - a1
+    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
+
+    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
+    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
+    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
+    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
+
+    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
+    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
+    "vshrn.s32       d1, q9, #9               \n"
+    "vshrn.s32       d3, q10, #9              \n"
+
+    // part 2
+    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
+    "vtrn.32         d0, d2                   \n"
+    "vtrn.32         d1, d3                   \n"
+    "vtrn.16         d0, d1                   \n"
+    "vtrn.16         d2, d3                   \n"
+
+    "vmov.s16        d26, #7                  \n"
+
+    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
+    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
+    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
+    "vadd.s16        d4, d4, d26              \n" // a1 + 7
+    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
+
+    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
+    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
+
+    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
+    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
+
+    "vceq.s16        d4, d7, #0               \n"
+
+    "vshr.s16        d0, d0, #4               \n"
+    "vshr.s16        d2, d2, #4               \n"
+
+    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
+    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
+
+    "vmvn.s16        d4, d4                   \n"
+    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
+    "vshrn.s32       d1, q11, #16             \n"
+    // op[4] += (d1!=0)
+    "vsub.s16        d1, d1, d4               \n"
+    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
+    "vshrn.s32       d3, q12, #16             \n"
+
+    // set result to out array
+    "vst1.16         {q0, q1}, [%[out]]   \n"
+    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
+      [coeff32] "+r"(coeff32)          // modified registers
+    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
+      [out] "r"(out)                   // constants
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13"       // clobbered
+  );
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+  const int kStep = 32;
+  __asm__ volatile (
+    // d0 = in[0 * 16] , d1 = in[1 * 16]
+    // d2 = in[2 * 16] , d3 = in[3 * 16]
+    "vld1.16         d0[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[0], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[1], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[2], [%[in]], %[kStep]   \n"
+    "vld1.16         d0[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d1[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d2[3], [%[in]], %[kStep]   \n"
+    "vld1.16         d3[3], [%[in]], %[kStep]   \n"
+
+    "vaddl.s16       q2, d0, d2                 \n"
+    "vshl.s32        q2, q2, #2                 \n" // a0=(in[0*16]+in[2*16])<<2
+    "vaddl.s16       q3, d1, d3                 \n"
+    "vshl.s32        q3, q3, #2                 \n" // a1=(in[1*16]+in[3*16])<<2
+    "vsubl.s16       q4, d1, d3                 \n"
+    "vshl.s32        q4, q4, #2                 \n" // a2=(in[1*16]-in[3*16])<<2
+    "vsubl.s16       q5, d0, d2                 \n"
+    "vshl.s32        q5, q5, #2                 \n" // a3=(in[0*16]-in[2*16])<<2
+
+    "vceq.s32        q10, q2, #0                \n"
+    "vmvn.s32        q10, q10                   \n" // (a0 != 0)
+    "vqadd.s32       q6, q2, q3                 \n" // (a0 + a1)
+    "vqsub.s32       q6, q6, q10                \n" // (a0 + a1) + (a0 != 0)
+    "vqadd.s32       q7, q5, q4                 \n" // a3 + a2
+    "vqsub.s32       q8, q5, q4                 \n" // a3 - a2
+    "vqsub.s32       q9, q2, q3                 \n" // a0 - a1
+
+    // Transpose
+    // q6 = tmp[0, 1,  2,  3] ; q7 = tmp[ 4,  5,  6,  7]
+    // q8 = tmp[8, 9, 10, 11] ; q9 = tmp[12, 13, 14, 15]
+    "vswp            d13, d16                   \n" // vtrn.64 q0, q2
+    "vswp            d15, d18                   \n" // vtrn.64 q1, q3
+    "vtrn.32         q6, q7                     \n"
+    "vtrn.32         q8, q9                     \n"
+
+    "vqadd.s32       q0, q6, q8                 \n" // a0 = tmp[0] + tmp[8]
+    "vqadd.s32       q1, q7, q9                 \n" // a1 = tmp[4] + tmp[12]
+    "vqsub.s32       q2, q7, q9                 \n" // a2 = tmp[4] - tmp[12]
+    "vqsub.s32       q3, q6, q8                 \n" // a3 = tmp[0] - tmp[8]
+
+    "vqadd.s32       q4, q0, q1                 \n" // b0 = a0 + a1
+    "vqadd.s32       q5, q3, q2                 \n" // b1 = a3 + a2
+    "vqsub.s32       q6, q3, q2                 \n" // b2 = a3 - a2
+    "vqsub.s32       q7, q0, q1                 \n" // b3 = a0 - a1
+
+    "vmov.s32         q0, #3                    \n" // q0 = 3
+
+    "vcgt.s32        q1, q4, #0                 \n" // (b0>0)
+    "vqsub.s32       q2, q4, q1                 \n" // (b0+(b0>0))
+    "vqadd.s32       q3, q2, q0                 \n" // (b0+(b0>0)+3)
+    "vshrn.s32       d18, q3, #3                \n" // (b0+(b0>0)+3) >> 3
+
+    "vcgt.s32        q1, q5, #0                 \n" // (b1>0)
+    "vqsub.s32       q2, q5, q1                 \n" // (b1+(b1>0))
+    "vqadd.s32       q3, q2, q0                 \n" // (b1+(b1>0)+3)
+    "vshrn.s32       d19, q3, #3                \n" // (b1+(b1>0)+3) >> 3
+
+    "vcgt.s32        q1, q6, #0                 \n" // (b2>0)
+    "vqsub.s32       q2, q6, q1                 \n" // (b2+(b2>0))
+    "vqadd.s32       q3, q2, q0                 \n" // (b2+(b2>0)+3)
+    "vshrn.s32       d20, q3, #3                \n" // (b2+(b2>0)+3) >> 3
+
+    "vcgt.s32        q1, q7, #0                 \n" // (b3>0)
+    "vqsub.s32       q2, q7, q1                 \n" // (b3+(b3>0))
+    "vqadd.s32       q3, q2, q0                 \n" // (b3+(b3>0)+3)
+    "vshrn.s32       d21, q3, #3                \n" // (b3+(b3>0)+3) >> 3
+
+    "vst1.16         {q9, q10}, [%[out]]        \n"
+
+    : [in] "+r"(in)
+    : [kStep] "r"(kStep), [out] "r"(out)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5",
+      "q6", "q7", "q8", "q9", "q10"       // clobbered
+  ) ;
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// This uses a TTransform helper function in C
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+                    const uint16_t* const w) {
+  const int kBPS = BPS;
+  const uint8_t* A = a;
+  const uint8_t* B = b;
+  const uint16_t* W = w;
+  int sum;
+  __asm__ volatile (
+    "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
+    "vld1.32         d2[1], [%[a]]            \n"
+
+    "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
+    "vld1.32         d3[1], [%[b]]            \n"
+
+    // a d0/d2, b d1/d3
+    // d0/d1: 01 01 01 01
+    // d2/d3: 23 23 23 23
+    // But: it goes 01 45 23 67
+    // Notice the middle values are transposed
+    "vtrn.16         q0, q1                   \n"
+
+    // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+    "vaddl.u8        q2, d0, d2               \n"
+    "vaddl.u8        q10, d1, d3              \n"
+    // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+    "vsubl.u8        q3, d0, d2               \n"
+    "vsubl.u8        q11, d1, d3              \n"
+
+    // tmp[0] = a0 + a1
+    "vpaddl.s16      q0, q2                   \n"
+    "vpaddl.s16      q8, q10                  \n"
+
+    // tmp[1] = a3 + a2
+    "vpaddl.s16      q1, q3                   \n"
+    "vpaddl.s16      q9, q11                  \n"
+
+    // No pair subtract
+    // q2 = {a0, a3}
+    // q3 = {a1, a2}
+    "vtrn.16         q2, q3                   \n"
+    "vtrn.16         q10, q11                 \n"
+
+    // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
+    "vsubl.s16       q12, d4, d6              \n"
+    "vsubl.s16       q13, d5, d7              \n"
+    "vsubl.s16       q14, d20, d22            \n"
+    "vsubl.s16       q15, d21, d23            \n"
+
+    // separate tmp[3] and tmp[2]
+    // q12 = tmp[3]
+    // q13 = tmp[2]
+    "vtrn.32         q12, q13                 \n"
+    "vtrn.32         q14, q15                 \n"
+
+    // Transpose tmp for a
+    "vswp            d1, d26                  \n" // vtrn.64
+    "vswp            d3, d24                  \n" // vtrn.64
+    "vtrn.32         q0, q1                   \n"
+    "vtrn.32         q13, q12                 \n"
+
+    // Transpose tmp for b
+    "vswp            d17, d30                 \n" // vtrn.64
+    "vswp            d19, d28                 \n" // vtrn.64
+    "vtrn.32         q8, q9                   \n"
+    "vtrn.32         q15, q14                 \n"
+
+    // The first Q register is a, the second b.
+    // q0/8 tmp[0-3]
+    // q13/15 tmp[4-7]
+    // q1/9 tmp[8-11]
+    // q12/14 tmp[12-15]
+
+    // These are still in 01 45 23 67 order. We fix it easily in the addition
+    // case but the subtraction propegates them.
+    "vswp            d3, d27                  \n"
+    "vswp            d19, d31                 \n"
+
+    // a0 = tmp[0] + tmp[8]
+    "vadd.s32        q2, q0, q1               \n"
+    "vadd.s32        q3, q8, q9               \n"
+
+    // a1 = tmp[4] + tmp[12]
+    "vadd.s32        q10, q13, q12            \n"
+    "vadd.s32        q11, q15, q14            \n"
+
+    // a2 = tmp[4] - tmp[12]
+    "vsub.s32        q13, q13, q12            \n"
+    "vsub.s32        q15, q15, q14            \n"
+
+    // a3 = tmp[0] - tmp[8]
+    "vsub.s32        q0, q0, q1               \n"
+    "vsub.s32        q8, q8, q9               \n"
+
+    // b0 = a0 + a1
+    "vadd.s32        q1, q2, q10              \n"
+    "vadd.s32        q9, q3, q11              \n"
+
+    // b1 = a3 + a2
+    "vadd.s32        q12, q0, q13             \n"
+    "vadd.s32        q14, q8, q15             \n"
+
+    // b2 = a3 - a2
+    "vsub.s32        q0, q0, q13              \n"
+    "vsub.s32        q8, q8, q15              \n"
+
+    // b3 = a0 - a1
+    "vsub.s32        q2, q2, q10              \n"
+    "vsub.s32        q3, q3, q11              \n"
+
+    "vld1.64         {q10, q11}, [%[w]]       \n"
+
+    // abs(b0)
+    "vabs.s32        q1, q1                   \n"
+    "vabs.s32        q9, q9                   \n"
+    // abs(b1)
+    "vabs.s32        q12, q12                 \n"
+    "vabs.s32        q14, q14                 \n"
+    // abs(b2)
+    "vabs.s32        q0, q0                   \n"
+    "vabs.s32        q8, q8                   \n"
+    // abs(b3)
+    "vabs.s32        q2, q2                   \n"
+    "vabs.s32        q3, q3                   \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d20                 \n"
+    "vmovl.u16       q15, d21                 \n"
+
+    // w[0] * abs(b0)
+    "vmul.u32        q1, q1, q13              \n"
+    "vmul.u32        q9, q9, q13              \n"
+
+    // w[4] * abs(b1)
+    "vmla.u32        q1, q12, q15             \n"
+    "vmla.u32        q9, q14, q15             \n"
+
+    // expand w before using.
+    "vmovl.u16       q13, d22                 \n"
+    "vmovl.u16       q15, d23                 \n"
+
+    // w[8] * abs(b1)
+    "vmla.u32        q1, q0, q13              \n"
+    "vmla.u32        q9, q8, q13              \n"
+
+    // w[12] * abs(b1)
+    "vmla.u32        q1, q2, q15              \n"
+    "vmla.u32        q9, q3, q15              \n"
+
+    // Sum the arrays
+    "vpaddl.u32      q1, q1                   \n"
+    "vpaddl.u32      q9, q9                   \n"
+    "vadd.u64        d2, d3                   \n"
+    "vadd.u64        d18, d19                 \n"
+
+    // Hadamard transform needs 4 bits of extra precision (2 bits in each
+    // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
+    // precision for coeff is 8bit of input + 4bits of Hadamard transform +
+    // 16bits for w[] + 2 bits of abs() summation.
+    //
+    // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
+    // A-OK.
+
+    // sum2 - sum1
+    "vsub.u32        d0, d2, d18              \n"
+    // abs(sum2 - sum1)
+    "vabs.s32        d0, d0                   \n"
+    // abs(sum2 - sum1) >> 5
+    "vshr.u32        d0, #5                   \n"
+
+    // It would be better to move the value straight into r0 but I'm not
+    // entirely sure how this works with inline assembly.
+    "vmov.32         %[sum], d0[0]            \n"
+
+    : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
+    : [kBPS] "r"(kBPS)
+    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
+  ) ;
+
+  return sum;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
+  int D = 0;
+  int x, y;
+  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+    for (x = 0; x < 16; x += 4) {
+      D += Disto4x4(a + x + y, b + x + y, w);
+    }
+  }
+  return D;
+}
+
+#endif   // WEBP_USE_NEON
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitNEON(void);
+
+void VP8EncDspInitNEON(void) {
+#if defined(WEBP_USE_NEON)
+  VP8ITransform = ITransform;
+  VP8FTransform = FTransform;
+
+  VP8ITransformWHT = ITransformWHT;
+  VP8FTransformWHT = FTransformWHT;
+
+  VP8TDisto4x4 = Disto4x4;
+  VP8TDisto16x16 = Disto16x16;
+#endif   // WEBP_USE_NEON
+}
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}    // extern "C"
+#endif
--- a/Show More
+++ b/Show More