diff --git a/PATENTS b/PATENTS index 4414d8385..79d17d7d6 100644 --- a/PATENTS +++ b/PATENTS @@ -1,22 +1,23 @@ Additional IP Rights Grant (Patents) +------------------------------------ -"This implementation" means the copyrightable works distributed by -Google as part of the WebM Project. +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, royalty-free, irrevocable (except as stated in this section) -patent license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of VP8, where such license applies only to those patent -claims, both currently owned by Google and acquired in the future, -licensable by Google that are necessarily infringed by this -implementation of VP8. This grant does not include claims that would be -infringed only as a consequence of further modification of this -implementation. If you or your agent or exclusive licensee institute or -order or agree to the institution of patent litigation against any -entity (including a cross-claim or counterclaim in a lawsuit) alleging -that this implementation of VP8 or any code incorporated within this -implementation of VP8 constitutes direct or contributory patent -infringement, or inducement of patent infringement, then any patent -rights granted to you under this License for this implementation of VP8 -shall terminate as of the date such litigation is filed. +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitutes direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. 
diff --git a/build/make/Makefile b/build/make/Makefile index dc61429a9..ed90397f0 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -118,20 +118,26 @@ testdata:: utiltest: # Add compiler flags for intrinsic files +ifeq ($(TOOLCHAIN), x86-os2-gcc) +STACKREALIGN=-mstackrealign +else +STACKREALIGN= +endif + $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx -$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 -$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 -$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 -$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 -$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 -$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 -$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 -$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 -$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx -$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx -$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 -$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN) $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") @@ -196,13 +202,13 @@ $(BUILD_PFX)%.asm.s: %.asm # the copy implementation HAVE_GNU_STRIP := $(if $(CONFIG_DEBUG),,$(HAVE_GNU_STRIP)) ifeq ($(HAVE_GNU_STRIP),yes) -# Older binutils strip global sybols not needed for relocation processing -# when given --strip-unneeded. Use nm and awk to identify globals and -# keep them. +# Older binutils strip global symbols not needed for relocation processing +# when given --strip-unneeded. Using nm and awk to identify globals and +# keep them caused command line length issues under mingw and segfaults in +# test_libvpx were observed under OS/2: simply use --strip-debug. %.a: %_g.a $(if $(quiet),@echo " [STRIP] $@ < $<") - $(qexec)$(STRIP) --strip-unneeded \ - `$(NM) $< | grep ' [A-TV-Z] ' | awk '{print "-K"$$3'}`\ + $(qexec)$(STRIP) --strip-debug \ -o $@ $< else %.a: %_g.a diff --git a/build/make/configure.sh b/build/make/configure.sh old mode 100755 new mode 100644 index f22e3e0ac..ab6687f73 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -252,7 +252,7 @@ tolower(){ # source_path=${0%/*} enable_feature source_path_used -if test -z "$source_path" -o "$source_path" = "." ; then +if [ -z "$source_path" ] || [ "$source_path" = "." ]; then source_path="`pwd`" disable_feature source_path_used fi @@ -381,8 +381,8 @@ EOF # tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used. check_gcc_machine_option() { - local opt="$1" - local feature="$2" + opt="$1" + feature="$2" [ -n "$feature" ] || feature="$opt" if enabled gcc && ! disabled "$feature" && ! 
check_cflags "-m$opt"; then @@ -419,8 +419,8 @@ true } write_common_target_config_mk() { - local CC="${CC}" - local CXX="${CXX}" + saved_CC="${CC}" + saved_CXX="${CXX}" enabled ccache && CC="ccache ${CC}" enabled ccache && CXX="ccache ${CXX}" print_webm_license $1 "##" "" @@ -470,6 +470,8 @@ EOF enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}" + CC="${saved_CC}" + CXX="${saved_CXX}" } @@ -547,7 +549,8 @@ process_common_cmdline() { alt_libc="${optval}" ;; --as=*) - [ "${optval}" = yasm -o "${optval}" = nasm -o "${optval}" = auto ] \ + [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \ + || [ "${optval}" = auto ] \ || die "Must be yasm, nasm or auto: ${optval}" alt_as="${optval}" ;; @@ -555,8 +558,8 @@ process_common_cmdline() { w="${optval%%x*}" h="${optval##*x}" VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}" - [ ${w} -gt 0 -a ${h} -gt 0 ] || die "Invalid size-limit: too small." - [ ${w} -lt 65536 -a ${h} -lt 65536 ] \ + [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small." + [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \ || die "Invalid size-limit: too big." enable_feature size_limit ;; @@ -1150,7 +1153,7 @@ EOF auto|"") which nasm >/dev/null 2>&1 && AS=nasm which yasm >/dev/null 2>&1 && AS=yasm - [ "${AS}" = auto -o -z "${AS}" ] \ + [ "${AS}" = auto ] || [ -z "${AS}" ] \ && die "Neither yasm nor nasm have been found" ;; esac @@ -1222,7 +1225,12 @@ EOF fi fi - enabled debug && check_add_cflags -g && check_add_ldflags -g + if enabled debug; then + check_add_cflags -g && check_add_ldflags -g + else + check_add_cflags -DNDEBUG + fi + enabled gprof && check_add_cflags -pg && check_add_ldflags -pg enabled gcov && check_add_cflags -fprofile-arcs -ftest-coverage && @@ -1309,8 +1317,9 @@ process_toolchain() { } print_config_mk() { - local prefix=$1 - local makefile=$2 + saved_prefix="${prefix}" + prefix=$1 + makefile=$2 shift 2 for cfg; do if enabled $cfg; then @@ -1318,11 +1327,13 @@ echo "${prefix}_${upname}=yes" >> $makefile fi done + prefix="${saved_prefix}" } print_config_h() { - local prefix=$1 - local header=$2 + saved_prefix="${prefix}" + prefix=$1 + header=$2 shift 2 for cfg; do upname="`toupper $cfg`" @@ -1332,10 +1343,11 @@ echo "#define ${prefix}_${upname} 0" >> $header fi done + prefix="${saved_prefix}" } print_config_vars_h() { - local header=$1 + header=$1 shift while [ $# -gt 0 ]; do upname="`toupper $1`" @@ -1345,9 +1357,10 @@ } print_webm_license() { - local destination=$1 - local prefix="$2" - local suffix="$3" + saved_prefix="${prefix}" + destination=$1 + prefix="$2" + suffix="$3" shift 3 cat <<EOF > ${destination} ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix} @@ -1358,6 +1371,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s ${prefix} in the file PATENTS.
All contributing project authors may${suffix} ${prefix} be found in the AUTHORS file in the root of the source tree.${suffix} EOF + prefix="${saved_prefix}" } process_targets() { diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index f5f59b146..28ef69c23 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -3,7 +3,7 @@ no strict 'refs'; use warnings; use Getopt::Long; -Getopt::Long::Configure("auto_help"); +Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32; my %ALL_FUNCS = (); my @ALL_ARCHS; @@ -385,6 +385,8 @@ if ($opts{arch} eq 'x86') { arm; } elsif ($opts{arch} eq 'armv7') { @ALL_ARCHS = filter(qw/edsp media neon_asm neon/); + @REQUIRES = filter(keys %required ? keys %required : qw/media/); + &require(@REQUIRES); arm; } elsif ($opts{arch} eq 'armv8') { @ALL_ARCHS = filter(qw/neon/); diff --git a/configure b/configure index d5be3a2fd..2ac43c109 100755 --- a/configure +++ b/configure @@ -25,6 +25,7 @@ Advanced options: ${toggle_docs} documentation ${toggle_unit_tests} unit tests ${toggle_decode_perf_tests} build decoder perf tests with unit tests + ${toggle_encode_perf_tests} build encoder perf tests with unit tests --libc=PATH path to alternate libc --size-limit=WxH max size to allow in the decoder --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] @@ -45,6 +46,9 @@ Advanced options: ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_onthefly_bitpacking} enable on-the-fly bitpacking in real-time encoding ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses + ${toggle_coefficient_range_checking} + enable decoder to check if intermediate + transform coefficients are in valid range ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support ${toggle_static} static library support @@ -66,10 +70,10 @@ Codecs: EOF #restore editor state ' - local family; - local last_family; - local c; - local str; + family=""; + last_family=""; + c=""; + str=""; for c in ${CODECS}; do family=${c%_*} if [ "${family}" != "${last_family}" ]; then @@ -271,12 +275,10 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - alpha vp9_high - multiple_arf spatial_svc high_quant - denoising + vp9_temporal_denoising emulate_hw_high fp_mb_stats " @@ -328,8 +330,10 @@ CONFIG_LIST=" webm_io libyuv decode_perf_tests + encode_perf_tests multi_res_encoding temporal_denoising + coefficient_range_checking experimental size_limit ${EXPERIMENT_LIST} @@ -384,8 +388,10 @@ CMDLINE_SELECT=" webm_io libyuv decode_perf_tests + encode_perf_tests multi_res_encoding temporal_denoising + coefficient_range_checking experimental " @@ -413,7 +419,7 @@ process_cmdline() { } post_process_cmdline() { - local c + c="" # If the codec family is disabled, disable all components of that family. # If the codec family is enabled, enable all components of that family. 
@@ -460,8 +466,8 @@ process_targets() { enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk # Calculate the default distribution name, based on the enabled features - local cf - local DIST_DIR=vpx + cf="" + DIST_DIR=vpx for cf in $CODEC_FAMILIES; do if enabled ${cf}_encoder && enabled ${cf}_decoder; then DIST_DIR="${DIST_DIR}-${cf}" @@ -483,7 +489,7 @@ process_targets() { ;; esac if [ -f "${source_path}/build/make/version.sh" ]; then - local ver=`"$source_path/build/make/version.sh" --bare "$source_path"` + ver=`"$source_path/build/make/version.sh" --bare "$source_path"` DIST_DIR="${DIST_DIR}-${ver}" VERSION_STRING=${ver} ver=${ver%%-*} @@ -517,7 +523,7 @@ EOF # Write makefiles for all enabled targets # for tgt in libs examples docs solution; do - local tgt_fn="$tgt-$toolchain.mk" + tgt_fn="$tgt-$toolchain.mk" if enabled $tgt; then echo "Creating makefiles for ${toolchain} ${tgt}" @@ -556,7 +562,7 @@ process_detect() { true; ;; *) - local result=false + result=false for d in "$@"; do [ -f "${d##-I}/$header" ] && result=true && break done @@ -605,7 +611,7 @@ process_toolchain() { # Handle universal binaries for this architecture case $toolchain in universal-darwin*) - local darwin_ver=${tgt_os##darwin} + darwin_ver=${tgt_os##darwin} # Snow Leopard (10.6/darwin10) dropped support for PPC # Include PPC support for all prior versions diff --git a/examples.mk b/examples.mk index a47db04ae..91bd45aa4 100644 --- a/examples.mk +++ b/examples.mk @@ -9,8 +9,12 @@ ## LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/convert.h \ + third_party/libyuv/include/libyuv/convert_argb.h \ + third_party/libyuv/include/libyuv/convert_from.h \ third_party/libyuv/include/libyuv/cpu_id.h \ third_party/libyuv/include/libyuv/planar_functions.h \ + third_party/libyuv/include/libyuv/rotate.h \ third_party/libyuv/include/libyuv/row.h \ third_party/libyuv/include/libyuv/scale.h \ third_party/libyuv/include/libyuv/scale_row.h \ @@ -20,14 +24,15 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ third_party/libyuv/source/row_common.cc \ third_party/libyuv/source/row_mips.cc \ third_party/libyuv/source/row_neon.cc \ + third_party/libyuv/source/row_neon64.cc \ third_party/libyuv/source/row_posix.cc \ third_party/libyuv/source/row_win.cc \ - third_party/libyuv/source/scale.cc \ + third_party/libyuv/source/scale.cc \ third_party/libyuv/source/scale_common.cc \ third_party/libyuv/source/scale_mips.cc \ third_party/libyuv/source/scale_neon.cc \ third_party/libyuv/source/scale_posix.cc \ - third_party/libyuv/source/scale_win.cc + third_party/libyuv/source/scale_win.cc \ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \ third_party/libwebm/mkvmuxerutil.cpp \ @@ -210,17 +215,18 @@ endif # from an installed tree or a version controlled tree. Determine # the proper paths. ifeq ($(HAVE_ALT_TREE_LAYOUT),yes) - LIB_PATH := $(SRC_PATH_BARE)/../lib - INC_PATH := $(SRC_PATH_BARE)/../include + LIB_PATH-yes := $(SRC_PATH_BARE)/../lib + INC_PATH-yes := $(SRC_PATH_BARE)/../include else LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.) 
INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8 INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8 INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9 INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9 - LIB_PATH := $(call enabled,LIB_PATH) - INC_PATH := $(call enabled,INC_PATH) endif +INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include +LIB_PATH := $(call enabled,LIB_PATH) +INC_PATH := $(call enabled,INC_PATH) INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH)) INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH)) diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c index dc9bc06b1..4c1b60bad 100644 --- a/examples/simple_encoder.c +++ b/examples/simple_encoder.c @@ -118,11 +118,12 @@ void usage_exit() { exit(EXIT_FAILURE); } -static void encode_frame(vpx_codec_ctx_t *codec, - vpx_image_t *img, - int frame_index, - int flags, - VpxVideoWriter *writer) { +static int encode_frame(vpx_codec_ctx_t *codec, + vpx_image_t *img, + int frame_index, + int flags, + VpxVideoWriter *writer) { + int got_pkts = 0; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt = NULL; const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1, @@ -131,6 +132,8 @@ static void encode_frame(vpx_codec_ctx_t *codec, die_codec(codec, "Failed to encode frame"); while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0; if (!vpx_video_writer_write_frame(writer, @@ -139,11 +142,12 @@ static void encode_frame(vpx_codec_ctx_t *codec, pkt->data.frame.pts)) { die_codec(codec, "Failed to write compressed frame"); } - printf(keyframe ? "K" : "."); fflush(stdout); } } + + return got_pkts; } int main(int argc, char **argv) { @@ -230,13 +234,16 @@ int main(int argc, char **argv) { if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) die_codec(&codec, "Failed to initialize encoder"); + // Encode frames. while (vpx_img_read(&raw, infile)) { int flags = 0; if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) flags |= VPX_EFLAG_FORCE_KF; encode_frame(&codec, &raw, frame_count++, flags, writer); } - encode_frame(&codec, NULL, -1, 0, writer); // flush the encoder + + // Flush encoder. 
+ while (encode_frame(&codec, NULL, -1, 0, writer)) {} printf("\n"); fclose(infile); diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index 7872048b1..2720088b3 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -636,7 +636,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYUV); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); diff --git a/libs.mk b/libs.mk index 1e01639c7..25fbc2cb9 100644 --- a/libs.mk +++ b/libs.mk @@ -409,12 +409,16 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 curl -L -o $@ $(call libvpx_test_data_url,$(@F)) testdata:: $(LIBVPX_TEST_DATA) - $(qexec)if [ -x "$$(which sha1sum)" ]; then\ + $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ + [ -x "$$(which shasum)" ] && sha1sum=shasum;\ + [ -x "$$(which sha1)" ] && sha1sum=sha1;\ + if [ -n "$${sha1sum}" ]; then\ + set -e;\ echo "Checking test data:";\ if [ -n "$(LIBVPX_TEST_DATA)" ]; then\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\ + (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ done; \ fi; \ else\ diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 1456791a1..fdc722797 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -21,29 +21,28 @@ #include "vpx_ports/mem.h" namespace { -typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); +typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h); struct ConvolveFunctions { - ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg, - convolve_fn_t v8, convolve_fn_t v8_avg, - convolve_fn_t hv8, convolve_fn_t hv8_avg, int bps) + ConvolveFunctions(ConvolveFunc h8, ConvolveFunc h8_avg, + ConvolveFunc v8, ConvolveFunc v8_avg, + ConvolveFunc hv8, ConvolveFunc hv8_avg, int bps) : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_high_bps_(bps) {} - - convolve_fn_t h8_; - convolve_fn_t v8_; - convolve_fn_t hv8_; - convolve_fn_t h8_avg_; - convolve_fn_t v8_avg_; - convolve_fn_t hv8_avg_; + ConvolveFunc h8_; + ConvolveFunc v8_; + ConvolveFunc hv8_; + ConvolveFunc h8_avg_; + ConvolveFunc v8_avg_; + ConvolveFunc hv8_avg_; int use_high_bps_; }; -typedef std::tr1::tuple<int, int, const ConvolveFunctions *> convolve_param_t; +typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam; // Reference 8-tap subpixel filter, slightly modified to fit into this test.
#define VP9_FILTER_WEIGHT 128 @@ -171,14 +170,14 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr, } void high_filter_block2d_8_c(const uint16_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - uint16_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { // Between passes, we use an intermediate buffer whose height is extended to // have enough horizontally filtered values as input for the vertical pass. // This buffer is allocated to be big enough for the largest block type we @@ -255,12 +254,12 @@ void high_filter_block2d_8_c(const uint16_t *src_ptr, } void high_block2d_average_c(uint16_t *src, - unsigned int src_stride, - uint16_t *output_ptr, - unsigned int output_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + unsigned int src_stride, + uint16_t *output_ptr, + unsigned int output_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { @@ -271,14 +270,14 @@ void high_block2d_average_c(uint16_t *src, } void high_filter_average_block2d_8_c(const uint16_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - uint16_t *dst_ptr, - unsigned int dst_stride, - unsigned int output_width, - unsigned int output_height, - int bps) { + const unsigned int src_stride, + const int16_t *HFilter, + const int16_t *VFilter, + uint16_t *dst_ptr, + unsigned int dst_stride, + unsigned int output_width, + unsigned int output_height, + int bps) { uint16_t tmp[64 * 64]; assert(output_width <= 64); @@ -289,7 +288,7 @@ void high_filter_average_block2d_8_c(const uint16_t *src_ptr, output_width, output_height, bps); } -class ConvolveTest : public ::testing::TestWithParam<convolve_param_t> { +class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> { public: static void SetUpTestCase() { // Force input_ to be unaligned, output to be 16 byte aligned. diff --git a/test/cq_test.cc b/test/cq_test.cc index 7da7b80aa..4e8019a87 100644 --- a/test/cq_test.cc +++ b/test/cq_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include <cmath> +#include <map> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -24,6 +25,28 @@ const unsigned int kCQTargetBitrate = 2000; class CQTest : public ::libvpx_test::EncoderTest, public ::libvpx_test::CodecTestWithParam<int> { + public: + // maps the cqlevel to the bitrate produced.
+ typedef std::map<int, uint32_t> BitrateMap; + + static void SetUpTestCase() { + bitrates_.clear(); + } + + static void TearDownTestCase() { + ASSERT_TRUE(!HasFailure()) + << "skipping bitrate validation due to earlier failure."; + uint32_t prev_actual_bitrate = kCQTargetBitrate; + for (BitrateMap::const_iterator iter = bitrates_.begin(); + iter != bitrates_.end(); ++iter) { + const uint32_t cq_actual_bitrate = iter->second; + EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate) + << "cq_level: " << iter->first + << ", bitrate should decrease with increase in CQ level."; + prev_actual_bitrate = cq_actual_bitrate; + } + } + protected: CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) { init_flags_ = VPX_CODEC_USE_PSNR; @@ -66,9 +89,12 @@ class CQTest : public ::libvpx_test::EncoderTest, return pow(10.0, avg_psnr / 10.0) / file_size_; } + int cq_level() const { return cq_level_; } size_t file_size() const { return file_size_; } int n_frames() const { return n_frames_; } + static BitrateMap bitrates_; + private: int cq_level_; size_t file_size_; @@ -76,7 +102,8 @@ class CQTest : public ::libvpx_test::EncoderTest, int n_frames_; }; -unsigned int prev_actual_bitrate = kCQTargetBitrate; +CQTest::BitrateMap CQTest::bitrates_; + TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { const vpx_rational timebase = { 33333333, 1000000000 }; cfg_.g_timebase = timebase; @@ -91,8 +118,7 @@ TEST_P(CQTest, LinearPSNRIsHigherForCQLevel) { const unsigned int cq_actual_bitrate = static_cast<unsigned int>(file_size()) * 8 * 30 / (n_frames() * 1000); EXPECT_LE(cq_actual_bitrate, kCQTargetBitrate); - EXPECT_LE(cq_actual_bitrate, prev_actual_bitrate); - prev_actual_bitrate = cq_actual_bitrate; + bitrates_[cq_level()] = cq_actual_bitrate; // try targeting the approximate same bitrate with VBR mode cfg_.rc_end_usage = VPX_VBR; diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index fdc685804..94fa08331 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -259,15 +259,15 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) { } } -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_16x16_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_16x16_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct16x16Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht16x16Param; void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { @@ -550,15 +550,15 @@ class Trans16x16TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; int bit_depth_; int mask_; - iht_t inv_txfm_ref; + FhtFunc fwd_txfm_ref; + IhtFunc inv_txfm_ref; }; class Trans16x16DCT : public Trans16x16TestBase, - public ::testing::TestWithParam<dct_16x16_param_t> { + public ::testing::TestWithParam<Dct16x16Param> { public: virtual ~Trans16x16DCT() {} @@ -596,8 +596,8 @@ class Trans16x16DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(Trans16x16DCT, AccuracyCheck) { @@ -624,7 +624,7 @@ TEST_P(Trans16x16DCT, 
InvAccuracyCheck) { class Trans16x16HT : public Trans16x16TestBase, - public ::testing::TestWithParam<ht_16x16_param_t> { + public ::testing::TestWithParam<Ht16x16Param> { public: virtual ~Trans16x16HT() {} @@ -662,8 +662,8 @@ class Trans16x16HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(Trans16x16HT, AccuracyCheck) { @@ -755,29 +755,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0, 8))); #endif - -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. -extern "C" { -void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride); -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride, - int tx_type); -} -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16DCT, - ::testing::Values( - make_tuple(&vp9_fdct16x16_avx2, - &vp9_idct16x16_256_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3, 8))); -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0, 8), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1, 8), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2, 8))); -#endif } // namespace diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index 597a808ec..a05e09826 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc @@ -71,10 +71,10 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], } } -typedef void (*fwd_txfm_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*inv_txfm_t)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple<fwd_txfm_t, inv_txfm_t, int, int> trans_32x32_param_t; +typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, int> Trans32x32Param; #if CONFIG_VP9_HIGH @@ -88,7 +88,7 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { #endif -class Trans32x32Test : public ::testing::TestWithParam<trans_32x32_param_t> { +class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> { public: virtual ~Trans32x32Test() {} virtual void SetUp() { @@ -106,8 +106,8 @@ class Trans32x32Test : public ::testing::TestWithParam<trans_32x32_param_t> { int version_; int bit_depth_; int mask_; - fwd_txfm_t fwd_txfm_; - inv_txfm_t inv_txfm_; + FwdTxfmFunc fwd_txfm_; + InvTxfmFunc inv_txfm_; }; TEST_P(Trans32x32Test, AccuracyCheck) { diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc new file mode 100644 index 000000000..2837f8cbe --- /dev/null +++ b/test/decode_api_test.cc @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/ivf_video_source.h" +#include "./vpx_config.h" +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" + +namespace { + +#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0])) + +TEST(DecodeAPI, InvalidParams) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_DECODER + &vpx_codec_vp8_dx_algo, +#endif +#if CONFIG_VP9_DECODER + &vpx_codec_vp9_dx_algo, +#endif + }; + uint8_t buf[1] = {0}; + vpx_codec_ctx_t dec; + + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_dec_init(&dec, NULL, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, NULL, 0, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_decode(NULL, buf, 0, NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(NULL, buf, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(NULL, NULL, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, vpx_codec_destroy(NULL)); + EXPECT_TRUE(vpx_codec_error(NULL) != NULL); + + for (int i = 0; i < NELEMENTS(kCodecs); ++i) { + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_dec_init(NULL, kCodecs[i], NULL, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, kCodecs[i], NULL, 0)); + EXPECT_EQ(VPX_CODEC_UNSUP_BITSTREAM, + vpx_codec_decode(&dec, buf, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, NULL, NELEMENTS(buf), NULL, 0)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_decode(&dec, buf, 0, NULL, 0)); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } +} + +#if CONFIG_VP9_DECODER +// Test VP9 codec controls after a decode error to ensure the code doesn't +// misbehave. 
+void TestVp9Controls(vpx_codec_ctx_t *dec) { + static const int kControls[] = { + VP8D_GET_LAST_REF_UPDATES, + VP8D_GET_FRAME_CORRUPTED, + VP9D_GET_DISPLAY_SIZE, + }; + int val[2]; + + for (int i = 0; i < NELEMENTS(kControls); ++i) { + const vpx_codec_err_t res = vpx_codec_control_(dec, kControls[i], val); + switch (kControls[i]) { + case VP8D_GET_FRAME_CORRUPTED: + EXPECT_EQ(VPX_CODEC_ERROR, res) << kControls[i]; + break; + default: + EXPECT_EQ(VPX_CODEC_OK, res) << kControls[i]; + break; + } + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control_(dec, kControls[i], NULL)); + } + + vp9_ref_frame_t ref; + ref.idx = 0; + EXPECT_EQ(VPX_CODEC_ERROR, vpx_codec_control(dec, VP9_GET_REFERENCE, &ref)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP9_GET_REFERENCE, NULL)); + + vpx_ref_frame_t ref_copy; + const int width = 352; + const int height = 288; + ASSERT_TRUE( + vpx_img_alloc(&ref_copy.img, VPX_IMG_FMT_I420, width, height, 1) != NULL); + ref_copy.frame_type = VP8_LAST_FRAME; + EXPECT_EQ(VPX_CODEC_ERROR, + vpx_codec_control(dec, VP8_COPY_REFERENCE, &ref_copy)); + EXPECT_EQ(VPX_CODEC_INVALID_PARAM, + vpx_codec_control(dec, VP8_COPY_REFERENCE, NULL)); + vpx_img_free(&ref_copy.img); +} + +TEST(DecodeAPI, Vp9InvalidDecode) { + const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + const char filename[] = + "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; + libvpx_test::IVFVideoSource video(filename); + video.Init(); + video.Begin(); + ASSERT_TRUE(!HasFailure()); + + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); + const uint32_t frame_size = static_cast<uint32_t>(video.frame_size()); + EXPECT_EQ(VPX_CODEC_MEM_ERROR, + vpx_codec_decode(&dec, video.cxdata(), frame_size, NULL, 0)); + vpx_codec_iter_t iter = NULL; + EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); + + TestVp9Controls(&dec); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); +} +#endif // CONFIG_VP9_DECODER + +} // namespace diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index 9b423b353..11529b349 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -29,9 +29,9 @@ const double kUsecsInSec = 1000000.0; /* DecodePerfTest takes a tuple of filename + number of threads to decode with */ -typedef std::tr1::tuple<const char *, unsigned> decode_perf_param_t; +typedef std::tr1::tuple<const char *, unsigned> DecodePerfParam; -const decode_perf_param_t kVP9DecodePerfVectors[] = { +const DecodePerfParam kVP9DecodePerfVectors[] = { make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1), make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2), make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2), @@ -64,7 +64,7 @@ const decode_perf_param_t kVP9DecodePerfVectors[] = { power/temp/min max frame decode times/etc */ -class DecodePerfTest : public ::testing::TestWithParam<decode_perf_param_t> { +class DecodePerfTest : public ::testing::TestWithParam<DecodePerfParam> { }; TEST_P(DecodePerfTest, PerfTest) { @@ -92,6 +92,7 @@ TEST_P(DecodePerfTest, PerfTest) { const double fps = double(frames) / elapsed_secs; printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); printf("\t\"videoName\" : \"%s\",\n", video_name); printf("\t\"threadCount\" : %u,\n", threads); diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc index 3e4ef0ad1..99610ebc5 100644 --- a/test/decode_test_driver.cc +++ b/test/decode_test_driver.cc @@ -67,24 +67,33 @@ void DecoderTest::RunLoop(CompressedVideoSource *video, const vpx_codec_dec_cfg_t &dec_cfg) { Decoder* 
const decoder = codec_->CreateDecoder(dec_cfg, 0); ASSERT_TRUE(decoder != NULL); + bool end_of_file = false; // Decode frames. - for (video->Begin(); !::testing::Test::HasFailure() && video->cxdata(); + for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file; video->Next()) { PreDecodeFrameHook(*video, decoder); vpx_codec_stream_info_t stream_info; stream_info.sz = sizeof(stream_info); - const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(), - video->frame_size(), - &stream_info); - HandlePeekResult(decoder, video, res_peek); - ASSERT_FALSE(::testing::Test::HasFailure()); - vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(), - video->frame_size()); - if (!HandleDecodeResult(res_dec, *video, decoder)) - break; + if (video->cxdata() != NULL) { + const vpx_codec_err_t res_peek = decoder->PeekStream(video->cxdata(), + video->frame_size(), + &stream_info); + HandlePeekResult(decoder, video, res_peek); + ASSERT_FALSE(::testing::Test::HasFailure()); + + vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(), + video->frame_size()); + if (!HandleDecodeResult(res_dec, *video, decoder)) + break; + } else { + // Signal end of the file to the decoder. + const vpx_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError(); + end_of_file = true; + } DxDataIterator dec_iter = decoder->GetDxData(); const vpx_image_t *img = NULL; diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh index 6cb7d0e6e..854b74f84 100755 --- a/test/decode_to_md5.sh +++ b/test/decode_to_md5.sh @@ -44,8 +44,8 @@ decode_to_md5() { [ -e "${output_file}" ] || return 1 - local md5_last_frame=$(tail -n1 "${output_file}") - local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:]) + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" [ "${actual_md5}" = "${expected_md5}" ] || return 1 } diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc new file mode 100644 index 000000000..feef37e7b --- /dev/null +++ b/test/encode_perf_test.cc @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" +#include "./vpx_version.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx_ports/vpx_timer.h" + +namespace { + +const int kMaxPsnr = 100; +const double kUsecsInSec = 1000000.0; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), + width(width_), + height(height_), + bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { + EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484), + EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987), + EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718), + EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471), + EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", + 640, 480, 200, 300), + EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 12 }; + +#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) + +class VP9EncodePerfTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { + protected: + VP9EncodePerfTest() + : EncoderTest(GET_PARAM(0)), + min_psnr_(kMaxPsnr), + nframes_(0), + encoding_mode_(GET_PARAM(1)), + speed_(0) {} + + virtual ~VP9EncodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_resize_allowed = 0; + cfg_.rc_end_usage = VPX_CBR; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 1) { + encoder->Control(VP8E_SET_CPUUSED, speed_); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + min_psnr_ = kMaxPsnr; + nframes_ = 0; + } + + virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < min_psnr_) { + min_psnr_ = pkt->data.psnr.psnr[0]; + } + } + + // for performance reasons don't decode + virtual bool DoDecode() { return 0; } + + double min_psnr() const { + return min_psnr_; + } + + void set_speed(unsigned int speed) { + speed_ = speed; + } + + private: + double min_psnr_; + unsigned int nframes_; + libvpx_test::TestMode encoding_mode_; + unsigned speed_; +}; + +TEST_P(VP9EncodePerfTest, PerfTest) { + for (size_t i = 0; i < NELEMENTS(kVP9EncodePerfTestVectors); ++i) { + for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) { + SetUp(); + + const vpx_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kVP9EncodePerfTestVectors[i].bitrate; + + init_flags_ = VPX_CODEC_USE_PSNR; + + const unsigned frames = kVP9EncodePerfTestVectors[i].frames; + const char *video_name = 
kVP9EncodePerfTestVectors[i].name; + libvpx_test::I420VideoSource video( + video_name, + kVP9EncodePerfTestVectors[i].width, + kVP9EncodePerfTestVectors[i].height, + timebase.den, timebase.num, 0, + kVP9EncodePerfTestVectors[i].frames); + set_speed(kEncodePerfTestSpeeds[j]); + + vpx_usec_timer t; + vpx_usec_timer_start(&t); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + vpx_usec_timer_mark(&t); + const double elapsed_secs = vpx_usec_timer_elapsed(&t) / kUsecsInSec; + const double fps = frames / elapsed_secs; + const double minimum_psnr = min_psnr(); + + printf("{\n"); + printf("\t\"type\" : \"encode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", video_name); + printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f,\n", fps); + printf("\t\"minPsnr\" : %f,\n", minimum_psnr); + printf("\t\"speed\" : %d\n", kEncodePerfTestSpeeds[j]); + printf("}\n"); + } + } +} + +VP9_INSTANTIATE_TEST_CASE( + VP9EncodePerfTest, ::testing::Values(::libvpx_test::kRealTime)); +} // namespace diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 278133aac..9500fe49c 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -30,15 +30,15 @@ using libvpx_test::ACMRandom; namespace { const int kNumCoeffs = 16; -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_4x4_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_4x4_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct4x4Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht4x4Param; void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { vp9_fdct4x4_c(in, out, stride); @@ -245,14 +245,14 @@ class Trans4x4TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; int bit_depth_; int mask_; + FhtFunc fwd_txfm_ref; }; class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam<dct_4x4_param_t> { + public ::testing::TestWithParam<Dct4x4Param> { public: virtual ~Trans4x4DCT() {} @@ -275,8 +275,8 @@ class Trans4x4DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(Trans4x4DCT, AccuracyCheck) { @@ -297,7 +297,7 @@ TEST_P(Trans4x4DCT, InvAccuracyCheck) { class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam<ht_4x4_param_t> { + public ::testing::TestWithParam<Ht4x4Param> { public: virtual ~Trans4x4HT() {} @@ -321,8 +321,8 @@ class Trans4x4HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(Trans4x4HT, AccuracyCheck) { @@ -343,7 +343,7 @@ TEST_P(Trans4x4HT, InvAccuracyCheck) { class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam<dct_4x4_param_t> { + public ::testing::TestWithParam<Dct4x4Param> { public: virtual ~Trans4x4WHT() {} @@ -366,8 +366,8 @@ class Trans4x4WHT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc 
inv_txfm_; }; TEST_P(Trans4x4WHT, AccuracyCheck) { @@ -477,19 +477,4 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, 8))); #endif -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vp9_fdct4x4_avx2, - &vp9_idct4x4_16_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2, 8), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3, 8))); -#endif - } // namespace diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index e46091f97..fad424fe8 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -65,15 +65,15 @@ void reference_8x8_dct_2d(const int16_t input[kNumCoeffs], using libvpx_test::ACMRandom; namespace { -typedef void (*fdct_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*idct_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*fht_t) (const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*iht_t) (const tran_low_t *in, uint8_t *out, int stride, - int tx_type); +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); -typedef std::tr1::tuple<fdct_t, idct_t, int, int> dct_8x8_param_t; -typedef std::tr1::tuple<fht_t, iht_t, int, int> ht_8x8_param_t; +typedef std::tr1::tuple<FdctFunc, IdctFunc, int, int> Dct8x8Param; +typedef std::tr1::tuple<FhtFunc, IhtFunc, int, int> Ht8x8Param; void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { vp9_fdct8x8_c(in, out, stride); @@ -412,14 +412,14 @@ class FwdTrans8x8TestBase { } int pitch_; int tx_type_; - fht_t fwd_txfm_ref; + FhtFunc fwd_txfm_ref; int bit_depth_; int mask_; }; class FwdTrans8x8DCT : public FwdTrans8x8TestBase, - public ::testing::TestWithParam<dct_8x8_param_t> { + public ::testing::TestWithParam<Dct8x8Param> { public: virtual ~FwdTrans8x8DCT() {} @@ -443,8 +443,8 @@ class FwdTrans8x8DCT inv_txfm_(out, dst, stride); } - fdct_t fwd_txfm_; - idct_t inv_txfm_; + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; }; TEST_P(FwdTrans8x8DCT, SignBiasCheck) { @@ -469,7 +469,7 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { class FwdTrans8x8HT : public FwdTrans8x8TestBase, - public ::testing::TestWithParam<ht_8x8_param_t> { + public ::testing::TestWithParam<Ht8x8Param> { public: virtual ~FwdTrans8x8HT() {} @@ -493,8 +493,8 @@ class FwdTrans8x8HT inv_txfm_(out, dst, stride, tx_type_); } - fht_t fwd_txfm_; - iht_t inv_txfm_; + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; }; TEST_P(FwdTrans8x8HT, SignBiasCheck) { @@ -554,7 +554,7 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8DCT, ::testing::Values( - make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0, 8))); + make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0, 8))); INSTANTIATE_TEST_CASE_P( DISABLED_NEON, FwdTrans8x8HT, ::testing::Values( @@ -584,18 +584,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0, 8))); #endif - -#if HAVE_AVX2 && !CONFIG_VP9_HIGH -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8DCT, - ::testing::Values( - make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0, 8))); -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0, 8), - 
make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1, 8), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2, 8), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3, 8))); -#endif } // namespace diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index d1ea0967f..2400c2021 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -7,13 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <climits> -#include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" -#include "test/encode_test_driver.h" -#include "test/i420_video_source.h" -#include "test/util.h" +#include "test/video_source.h" namespace { @@ -33,10 +29,7 @@ class VP9FrameSizeTestsLarge virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, const libvpx_test::VideoSource &video, libvpx_test::Decoder *decoder) { - EXPECT_EQ(expected_res_, res_dec) - << "Expected " << expected_res_ - << "but got " << res_dec; - + EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } @@ -62,16 +55,6 @@ TEST_F(VP9FrameSizeTestsLarge, TestInvalidSizes) { video.set_limit(2); expected_res_ = VPX_CODEC_CORRUPT_FRAME; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); -#else - // If we are on a 32 bit platform we can't possibly allocate enough memory - // for the largest video frame size (64kx64k). This test checks that we - // properly return a memory error. - if (sizeof(size_t) == 4) { - video.SetSize(65535, 65535); - video.set_limit(2); - expected_res_ = VPX_CODEC_MEM_ERROR; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - } #endif } diff --git a/test/idct_test.cc b/test/idct_test.cc index c7f609d58..2ff9e6446 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -16,11 +16,11 @@ #include "vpx/vpx_integer.h" -typedef void (*idct_fn_t)(int16_t *input, unsigned char *pred_ptr, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); +typedef void (*IdctFunc)(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); namespace { -class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { +class IDCTTest : public ::testing::TestWithParam<IdctFunc> { protected: virtual void SetUp() { int i; @@ -33,7 +33,7 @@ class IDCTTest : public ::testing::TestWithParam<idct_fn_t> { virtual void TearDown() { libvpx_test::ClearSystemState(); } - idct_fn_t UUT; + IdctFunc UUT; int16_t input[16]; unsigned char output[256]; unsigned char predict[256]; diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc index d93251dfe..ead476030 100644 --- a/test/intrapred_test.cc +++ b/test/intrapred_test.cc @@ -216,16 +216,16 @@ class IntraPredBase { int num_planes_; }; -typedef void (*intra_pred_y_fn_t)(MACROBLOCKD *x, - uint8_t *yabove_row, - uint8_t *yleft, - int left_stride, - uint8_t *ypred_ptr, - int y_stride); +typedef void (*IntraPredYFunc)(MACROBLOCKD *x, + uint8_t *yabove_row, + uint8_t *yleft, + int left_stride, + uint8_t *ypred_ptr, + int y_stride); class IntraPredYTest : public IntraPredBase, - public ::testing::TestWithParam<intra_pred_y_fn_t> { + public ::testing::TestWithParam<IntraPredYFunc> { public: static void SetUpTestCase() { mb_ = reinterpret_cast<MACROBLOCKD*>( @@ -267,7 +267,7 @@ class IntraPredYTest data_ptr_[0], kStride)); } - intra_pred_y_fn_t pred_fn_; + IntraPredYFunc pred_fn_; static uint8_t* data_array_; static MACROBLOCKD * mb_; static MODE_INFO *mi_; @@ -295,19 +295,19 @@ INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest, vp8_build_intra_predictors_mby_s_ssse3)); #endif -typedef 
void (*intra_pred_uv_fn_t)(MACROBLOCKD *x, - uint8_t *uabove_row, - uint8_t *vabove_row, - uint8_t *uleft, - uint8_t *vleft, - int left_stride, - uint8_t *upred_ptr, - uint8_t *vpred_ptr, - int pred_stride); +typedef void (*IntraPredUvFunc)(MACROBLOCKD *x, + uint8_t *uabove_row, + uint8_t *vabove_row, + uint8_t *uleft, + uint8_t *vleft, + int left_stride, + uint8_t *upred_ptr, + uint8_t *vpred_ptr, + int pred_stride); class IntraPredUVTest : public IntraPredBase, - public ::testing::TestWithParam<intra_pred_uv_fn_t> { + public ::testing::TestWithParam<IntraPredUvFunc> { public: static void SetUpTestCase() { mb_ = reinterpret_cast<MACROBLOCKD*>( @@ -349,7 +349,7 @@ class IntraPredUVTest data_ptr_[0], data_ptr_[1], kStride); } - intra_pred_uv_fn_t pred_fn_; + IntraPredUvFunc pred_fn_; // We use 24 so that the data pointer of the first pixel in each row of // each macroblock is 8-byte aligned, and this gives us access to the // top-left and top-right corner pixels belonging to the top-left/right diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 777e0e9fc..eb333fe37 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -26,14 +26,14 @@ using libvpx_test::ACMRandom; namespace { -typedef void (*fwd_txfm_t)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*inv_txfm_t)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple<fwd_txfm_t, inv_txfm_t, inv_txfm_t, TX_SIZE, int> partial_itxfm_param_t; +typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, InvTxfmFunc, TX_SIZE, int> PartialInvTxfmParam; const int kMaxNumCoeffs = 1024; -class PartialIDctTest : public ::testing::TestWithParam<partial_itxfm_param_t> { +class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { public: virtual ~PartialIDctTest() {} virtual void SetUp() { @@ -49,9 +49,9 @@ class PartialIDctTest : public ::testing::TestWithParam<partial_itxfm_param_t> { protected: int last_nonzero_; TX_SIZE tx_size_; - fwd_txfm_t ftxfm_; - inv_txfm_t full_itxfm_; - inv_txfm_t partial_itxfm_; + FwdTxfmFunc ftxfm_; + InvTxfmFunc full_itxfm_; + InvTxfmFunc partial_itxfm_; }; TEST_P(PartialIDctTest, RunQuantCheck) { diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 1144083ad..a9b16e055 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -15,18 +15,18 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -typedef void (*post_proc_func_t)(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int cols, - unsigned char *flimit, - int size); +typedef void (*PostProcFunc)(unsigned char *src_ptr, - unsigned char *dst_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int cols, + unsigned char *flimit, + int size); namespace { class VP8PostProcessingFilterTest - : public ::testing::TestWithParam<post_proc_func_t> { + : public ::testing::TestWithParam<PostProcFunc> { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } diff --git a/test/sad_test.cc b/test/sad_test.cc index 23dbd0565..e63770bd4 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -30,29 +30,27 @@ #if CONFIG_VP8_ENCODER -typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr, - int source_stride, - const unsigned char *reference_ptr, - int reference_stride, - unsigned int max_sad); -typedef std::tr1::tuple<int, int, sad_m_by_n_fn_t> sad_m_by_n_test_param_t; +typedef unsigned int (*SadMxNFunc)(const unsigned char *source_ptr, + int source_stride, + const unsigned char *reference_ptr, + int reference_stride, + unsigned int max_sad); +typedef std::tr1::tuple<int, int, SadMxNFunc> SadMxNParam; #endif #if 
CONFIG_VP9_ENCODER -typedef unsigned int (*sad_m_by_n_fn_vp9_t)(const unsigned char *source_ptr, - int source_stride, - const unsigned char *reference_ptr, - int reference_stride); -typedef std::tr1::tuple<int, int, sad_m_by_n_fn_vp9_t> - sad_m_by_n_test_param_vp9_t; +typedef unsigned int (*SadMxNVp9Func)(const unsigned char *source_ptr, + int source_stride, + const unsigned char *reference_ptr, + int reference_stride); +typedef std::tr1::tuple<int, int, SadMxNVp9Func> SadMxNVp9Param; #endif -typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr, - int src_stride, - const unsigned char * const ref_ptr[], - int ref_stride, - unsigned int *sad_array); -typedef std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> - sad_n_by_n_by_4_test_param_t; +typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, + int src_stride, + const unsigned char *const ref_ptr[], + int ref_stride, + unsigned int *sad_array); +typedef std::tr1::tuple<int, int, SadMxNx4Func> SadMxNx4Param; using libvpx_test::ACMRandom; @@ -140,7 +138,7 @@ class SADTestBase : public ::testing::Test { class SADx4Test : public SADTestBase, - public ::testing::WithParamInterface<sad_n_by_n_by_4_test_param_t> { + public ::testing::WithParamInterface<SadMxNx4Param> { public: SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -169,7 +167,7 @@ class SADx4Test #if CONFIG_VP8_ENCODER class SADTest : public SADTestBase, - public ::testing::WithParamInterface<sad_m_by_n_test_param_t> { + public ::testing::WithParamInterface<SadMxNParam> { public: SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -201,7 +199,7 @@ class SADTest #if CONFIG_VP9_ENCODER class SADVP9Test : public SADTestBase, - public ::testing::WithParamInterface<sad_m_by_n_test_param_vp9_t> { + public ::testing::WithParamInterface<SadMxNVp9Param> { public: SADVP9Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -382,12 +380,12 @@ using std::tr1::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c; -const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c; -const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c; -const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c; -const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c; -const sad_m_by_n_test_param_t c_tests[] = { +const SadMxNFunc sad_16x16_c = vp8_sad16x16_c; +const SadMxNFunc sad_8x16_c = vp8_sad8x16_c; +const SadMxNFunc sad_16x8_c = vp8_sad16x8_c; +const SadMxNFunc sad_8x8_c = vp8_sad8x8_c; +const SadMxNFunc sad_4x4_c = vp8_sad4x4_c; +const SadMxNParam c_tests[] = { make_tuple(16, 16, sad_16x16_c), make_tuple(8, 16, sad_8x16_c), make_tuple(16, 8, sad_16x8_c), @@ -398,16 +396,16 @@ INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests)); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_64x64_c_vp9 = vp9_sad64x64_c; -const sad_m_by_n_fn_vp9_t sad_32x32_c_vp9 = vp9_sad32x32_c; -const sad_m_by_n_fn_vp9_t sad_16x16_c_vp9 = vp9_sad16x16_c; -const sad_m_by_n_fn_vp9_t sad_8x16_c_vp9 = vp9_sad8x16_c; -const sad_m_by_n_fn_vp9_t sad_16x8_c_vp9 = vp9_sad16x8_c; -const sad_m_by_n_fn_vp9_t sad_8x8_c_vp9 = vp9_sad8x8_c; -const sad_m_by_n_fn_vp9_t sad_8x4_c_vp9 = vp9_sad8x4_c; -const sad_m_by_n_fn_vp9_t sad_4x8_c_vp9 = vp9_sad4x8_c; -const sad_m_by_n_fn_vp9_t sad_4x4_c_vp9 = vp9_sad4x4_c; -const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_c_vp9 = vp9_sad64x64_c; +const SadMxNVp9Func sad_32x32_c_vp9 = vp9_sad32x32_c; +const SadMxNVp9Func sad_16x16_c_vp9 = vp9_sad16x16_c; +const SadMxNVp9Func sad_8x16_c_vp9 = vp9_sad8x16_c; +const SadMxNVp9Func sad_16x8_c_vp9 = vp9_sad16x8_c; +const SadMxNVp9Func sad_8x8_c_vp9 = vp9_sad8x8_c; +const SadMxNVp9Func sad_8x4_c_vp9 = vp9_sad8x4_c; +const 
SadMxNVp9Func sad_4x8_c_vp9 = vp9_sad4x8_c; +const SadMxNVp9Func sad_4x4_c_vp9 = vp9_sad4x4_c; +const SadMxNVp9Param c_vp9_tests[] = { make_tuple(64, 64, sad_64x64_c_vp9), make_tuple(32, 32, sad_32x32_c_vp9), make_tuple(16, 16, sad_16x16_c_vp9), @@ -420,19 +418,19 @@ const sad_m_by_n_test_param_vp9_t c_vp9_tests[] = { }; INSTANTIATE_TEST_CASE_P(C, SADVP9Test, ::testing::ValuesIn(c_vp9_tests)); -const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c; -const sad_n_by_n_by_4_fn_t sad_64x32x4d_c = vp9_sad64x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x64x4d_c = vp9_sad32x64x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_32x16x4d_c = vp9_sad32x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x32x4d_c = vp9_sad16x32x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_c = vp9_sad16x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_c = vp9_sad8x16x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_8x4x4d_c = vp9_sad8x4x4d_c; -const sad_n_by_n_by_4_fn_t sad_4x8x4d_c = vp9_sad4x8x4d_c; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c; +const SadMxNx4Func sad_64x64x4d_c = vp9_sad64x64x4d_c; +const SadMxNx4Func sad_64x32x4d_c = vp9_sad64x32x4d_c; +const SadMxNx4Func sad_32x64x4d_c = vp9_sad32x64x4d_c; +const SadMxNx4Func sad_32x32x4d_c = vp9_sad32x32x4d_c; +const SadMxNx4Func sad_32x16x4d_c = vp9_sad32x16x4d_c; +const SadMxNx4Func sad_16x32x4d_c = vp9_sad16x32x4d_c; +const SadMxNx4Func sad_16x16x4d_c = vp9_sad16x16x4d_c; +const SadMxNx4Func sad_16x8x4d_c = vp9_sad16x8x4d_c; +const SadMxNx4Func sad_8x16x4d_c = vp9_sad8x16x4d_c; +const SadMxNx4Func sad_8x8x4d_c = vp9_sad8x8x4d_c; +const SadMxNx4Func sad_8x4x4d_c = vp9_sad8x4x4d_c; +const SadMxNx4Func sad_4x8x4d_c = vp9_sad4x8x4d_c; +const SadMxNx4Func sad_4x4x4d_c = vp9_sad4x4x4d_c; INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_c), make_tuple(64, 32, sad_64x32x4d_c), @@ -453,7 +451,7 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values( // ARM functions #if HAVE_MEDIA #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6; +const SadMxNFunc sad_16x16_armv6 = vp8_sad16x16_armv6; INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_armv6))); #endif // CONFIG_VP8_ENCODER @@ -461,11 +459,11 @@ INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values( #if HAVE_NEON #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon; -const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon; -const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon; -const sad_m_by_n_fn_t sad_8x8_neon = vp8_sad8x8_neon; -const sad_m_by_n_fn_t sad_4x4_neon = vp8_sad4x4_neon; +const SadMxNFunc sad_16x16_neon = vp8_sad16x16_neon; +const SadMxNFunc sad_8x16_neon = vp8_sad8x16_neon; +const SadMxNFunc sad_16x8_neon = vp8_sad16x8_neon; +const SadMxNFunc sad_8x8_neon = vp8_sad8x8_neon; +const SadMxNFunc sad_4x4_neon = vp8_sad4x4_neon; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_neon), make_tuple(8, 16, sad_8x16_neon), @@ -474,13 +472,15 @@ INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values( make_tuple(4, 4, sad_4x4_neon))); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_64x64_neon_vp9 = vp9_sad64x64_neon; -const sad_m_by_n_fn_vp9_t sad_32x32_neon_vp9 = vp9_sad32x32_neon; -const sad_m_by_n_fn_vp9_t sad_16x16_neon_vp9 
= vp9_sad16x16_neon; -const sad_m_by_n_test_param_vp9_t neon_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_neon_vp9 = vp9_sad64x64_neon; +const SadMxNVp9Func sad_32x32_neon_vp9 = vp9_sad32x32_neon; +const SadMxNVp9Func sad_16x16_neon_vp9 = vp9_sad16x16_neon; +const SadMxNVp9Func sad_8x8_neon_vp9 = vp9_sad8x8_neon; +const SadMxNVp9Param neon_vp9_tests[] = { make_tuple(64, 64, sad_64x64_neon_vp9), make_tuple(32, 32, sad_32x32_neon_vp9), make_tuple(16, 16, sad_16x16_neon_vp9), + make_tuple(8, 8, sad_8x8_neon_vp9), }; INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests)); #endif // CONFIG_VP9_ENCODER @@ -490,12 +490,12 @@ INSTANTIATE_TEST_CASE_P(NEON, SADVP9Test, ::testing::ValuesIn(neon_vp9_tests)); // x86 functions #if HAVE_MMX #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx; -const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx; -const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx; -const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx; -const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx; -const sad_m_by_n_test_param_t mmx_tests[] = { +const SadMxNFunc sad_16x16_mmx = vp8_sad16x16_mmx; +const SadMxNFunc sad_8x16_mmx = vp8_sad8x16_mmx; +const SadMxNFunc sad_16x8_mmx = vp8_sad16x8_mmx; +const SadMxNFunc sad_8x8_mmx = vp8_sad8x8_mmx; +const SadMxNFunc sad_4x4_mmx = vp8_sad4x4_mmx; +const SadMxNParam mmx_tests[] = { make_tuple(16, 16, sad_16x16_mmx), make_tuple(8, 16, sad_8x16_mmx), make_tuple(16, 8, sad_16x8_mmx), @@ -506,12 +506,12 @@ INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests)); #endif // CONFIG_VP8_ENCODER #if CONFIG_VP9_ENCODER -const sad_m_by_n_fn_vp9_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; -const sad_m_by_n_fn_vp9_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; -const sad_m_by_n_fn_vp9_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; -const sad_m_by_n_fn_vp9_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; -const sad_m_by_n_fn_vp9_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; -const sad_m_by_n_test_param_vp9_t mmx_vp9_tests[] = { +const SadMxNVp9Func sad_16x16_mmx_vp9 = vp9_sad16x16_mmx; +const SadMxNVp9Func sad_8x16_mmx_vp9 = vp9_sad8x16_mmx; +const SadMxNVp9Func sad_16x8_mmx_vp9 = vp9_sad16x8_mmx; +const SadMxNVp9Func sad_8x8_mmx_vp9 = vp9_sad8x8_mmx; +const SadMxNVp9Func sad_4x4_mmx_vp9 = vp9_sad4x4_mmx; +const SadMxNVp9Param mmx_vp9_tests[] = { make_tuple(16, 16, sad_16x16_mmx_vp9), make_tuple(8, 16, sad_8x16_mmx_vp9), make_tuple(16, 8, sad_16x8_mmx_vp9), @@ -525,14 +525,14 @@ INSTANTIATE_TEST_CASE_P(MMX, SADVP9Test, ::testing::ValuesIn(mmx_vp9_tests)); #if HAVE_SSE #if CONFIG_VP9_ENCODER #if CONFIG_USE_X86INC -const sad_m_by_n_fn_vp9_t sad_4x4_sse_vp9 = vp9_sad4x4_sse; -const sad_m_by_n_fn_vp9_t sad_4x8_sse_vp9 = vp9_sad4x8_sse; +const SadMxNVp9Func sad_4x4_sse_vp9 = vp9_sad4x4_sse; +const SadMxNVp9Func sad_4x8_sse_vp9 = vp9_sad4x8_sse; INSTANTIATE_TEST_CASE_P(SSE, SADVP9Test, ::testing::Values( make_tuple(4, 4, sad_4x4_sse_vp9), make_tuple(4, 8, sad_4x8_sse_vp9))); -const sad_n_by_n_by_4_fn_t sad_4x8x4d_sse = vp9_sad4x8x4d_sse; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse; +const SadMxNx4Func sad_4x8x4d_sse = vp9_sad4x8x4d_sse; +const SadMxNx4Func sad_4x4x4d_sse = vp9_sad4x4x4d_sse; INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( make_tuple(4, 8, sad_4x8x4d_sse), make_tuple(4, 4, sad_4x4x4d_sse))); @@ -542,12 +542,12 @@ INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values( #if HAVE_SSE2 #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt; -const sad_m_by_n_fn_t sad_8x16_wmt = 
vp8_sad8x16_wmt; -const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt; -const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt; -const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt; -const sad_m_by_n_test_param_t sse2_tests[] = { +const SadMxNFunc sad_16x16_wmt = vp8_sad16x16_wmt; +const SadMxNFunc sad_8x16_wmt = vp8_sad8x16_wmt; +const SadMxNFunc sad_16x8_wmt = vp8_sad16x8_wmt; +const SadMxNFunc sad_8x8_wmt = vp8_sad8x8_wmt; +const SadMxNFunc sad_4x4_wmt = vp8_sad4x4_wmt; +const SadMxNParam sse2_tests[] = { make_tuple(16, 16, sad_16x16_wmt), make_tuple(8, 16, sad_8x16_wmt), make_tuple(16, 8, sad_16x8_wmt), @@ -559,18 +559,18 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); #if CONFIG_VP9_ENCODER #if CONFIG_USE_X86INC -const sad_m_by_n_fn_vp9_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; -const sad_m_by_n_fn_vp9_t sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; -const sad_m_by_n_fn_vp9_t sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; -const sad_m_by_n_fn_vp9_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; -const sad_m_by_n_fn_vp9_t sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; -const sad_m_by_n_fn_vp9_t sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; -const sad_m_by_n_fn_vp9_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; -const sad_m_by_n_fn_vp9_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; -const sad_m_by_n_fn_vp9_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; -const sad_m_by_n_fn_vp9_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; -const sad_m_by_n_fn_vp9_t sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; -const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = { +const SadMxNVp9Func sad_64x64_sse2_vp9 = vp9_sad64x64_sse2; +const SadMxNVp9Func sad_64x32_sse2_vp9 = vp9_sad64x32_sse2; +const SadMxNVp9Func sad_32x64_sse2_vp9 = vp9_sad32x64_sse2; +const SadMxNVp9Func sad_32x32_sse2_vp9 = vp9_sad32x32_sse2; +const SadMxNVp9Func sad_32x16_sse2_vp9 = vp9_sad32x16_sse2; +const SadMxNVp9Func sad_16x32_sse2_vp9 = vp9_sad16x32_sse2; +const SadMxNVp9Func sad_16x16_sse2_vp9 = vp9_sad16x16_sse2; +const SadMxNVp9Func sad_16x8_sse2_vp9 = vp9_sad16x8_sse2; +const SadMxNVp9Func sad_8x16_sse2_vp9 = vp9_sad8x16_sse2; +const SadMxNVp9Func sad_8x8_sse2_vp9 = vp9_sad8x8_sse2; +const SadMxNVp9Func sad_8x4_sse2_vp9 = vp9_sad8x4_sse2; +const SadMxNVp9Param sse2_vp9_tests[] = { make_tuple(64, 64, sad_64x64_sse2_vp9), make_tuple(64, 32, sad_64x32_sse2_vp9), make_tuple(32, 64, sad_32x64_sse2_vp9), @@ -585,17 +585,17 @@ const sad_m_by_n_test_param_vp9_t sse2_vp9_tests[] = { }; INSTANTIATE_TEST_CASE_P(SSE2, SADVP9Test, ::testing::ValuesIn(sse2_vp9_tests)); -const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; -const sad_n_by_n_by_4_fn_t sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; +const SadMxNx4Func sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2; +const SadMxNx4Func sad_64x32x4d_sse2 = vp9_sad64x32x4d_sse2; +const SadMxNx4Func sad_32x64x4d_sse2 = vp9_sad32x64x4d_sse2; +const SadMxNx4Func sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2; +const SadMxNx4Func 
sad_32x16x4d_sse2 = vp9_sad32x16x4d_sse2; +const SadMxNx4Func sad_16x32x4d_sse2 = vp9_sad16x32x4d_sse2; +const SadMxNx4Func sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2; +const SadMxNx4Func sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2; +const SadMxNx4Func sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2; +const SadMxNx4Func sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2; +const SadMxNx4Func sad_8x4x4d_sse2 = vp9_sad8x4x4d_sse2; INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( make_tuple(64, 64, sad_64x64x4d_sse2), make_tuple(64, 32, sad_64x32x4d_sse2), @@ -614,11 +614,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values( #if HAVE_SSE3 #if CONFIG_VP8_ENCODER -const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; -const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; +const SadMxNx4Func sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3; +const SadMxNx4Func sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3; +const SadMxNx4Func sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3; +const SadMxNx4Func sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3; +const SadMxNx4Func sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( make_tuple(16, 16, sad_16x16x4d_sse3), make_tuple(16, 8, sad_16x8x4d_sse3), @@ -631,7 +631,7 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values( #if HAVE_SSSE3 #if CONFIG_USE_X86INC #if CONFIG_VP8_ENCODER -const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3; +const SadMxNFunc sad_16x16_sse3 = vp8_sad16x16_sse3; INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( make_tuple(16, 16, sad_16x16_sse3))); #endif // CONFIG_VP8_ENCODER @@ -640,19 +640,9 @@ INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values( #if HAVE_AVX2 #if CONFIG_VP9_ENCODER -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. 
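For orientation before the AVX2 cleanup below: a function bound to SadMxNFunc returns the sum of absolute differences between an MxN source block and a reference block, each addressed through its own stride, and the VP8 flavor may stop early once max_sad is exceeded. The sketch that follows is illustrative only and is not part of this patch; libvpx's real implementations are the per-size C and SIMD functions instantiated throughout this file. A concrete instantiation such as ReferenceSadMxN<16, 16> matches the SadMxNFunc signature.

```cpp
// Illustrative reference SAD (not libvpx code). Instantiate per block
// size, e.g. ReferenceSadMxN<16, 16>, to obtain a SadMxNFunc-compatible
// function.
template <int kWidth, int kHeight>
unsigned int ReferenceSadMxN(const unsigned char *source_ptr,
                             int source_stride,
                             const unsigned char *reference_ptr,
                             int reference_stride,
                             unsigned int max_sad) {
  unsigned int sad = 0;
  for (int y = 0; y < kHeight; ++y) {
    for (int x = 0; x < kWidth; ++x) {
      const int s = source_ptr[x];
      const int r = reference_ptr[x];
      sad += (s > r) ? (s - r) : (r - s);
    }
    // Early exit: the encoder only needs to know this candidate is already
    // worse than the best match found so far.
    if (sad > max_sad) return sad;
    source_ptr += source_stride;
    reference_ptr += reference_stride;
  }
  return sad;
}
```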
-extern "C" { -void vp9_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -void vp9_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_ptr[], int ref_stride, - unsigned int *sad_array); -} -const sad_n_by_n_by_4_fn_t sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; -const sad_n_by_n_by_4_fn_t sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; -INSTANTIATE_TEST_CASE_P(DISABLED_AVX2, SADx4Test, ::testing::Values( +const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2; +const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2; +INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values( make_tuple(32, 32, sad_32x32x4d_avx2), make_tuple(64, 64, sad_64x64x4d_avx2))); #endif // CONFIG_VP9_ENCODER diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc index ac7aa9947..1e6d91547 100644 --- a/test/sixtap_predict_test.cc +++ b/test/sixtap_predict_test.cc @@ -23,17 +23,17 @@ namespace { -typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch); +typedef void (*SixtapPredictFunc)(uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + uint8_t *dst_ptr, + int dst_pitch); -typedef std::tr1::tuple sixtap_predict_param_t; +typedef std::tr1::tuple SixtapPredictParam; class SixtapPredictTest - : public ::testing::TestWithParam { + : public ::testing::TestWithParam { public: static void SetUpTestCase() { src_ = reinterpret_cast(vpx_memalign(kDataAlignment, kSrcSize)); @@ -74,7 +74,7 @@ class SixtapPredictTest int width_; int height_; - sixtap_predict_fn_t sixtap_predict_; + SixtapPredictFunc sixtap_predict_; // The src stores the macroblock we will filter on, and makes it 1 byte larger // in order to test unaligned access. The result is stored in dst and dst_c(c // reference code result). 
@@ -184,10 +184,10 @@ TEST_P(SixtapPredictTest, TestWithRandomData) { using std::tr1::make_tuple; -const sixtap_predict_fn_t sixtap_16x16_c = vp8_sixtap_predict16x16_c; -const sixtap_predict_fn_t sixtap_8x8_c = vp8_sixtap_predict8x8_c; -const sixtap_predict_fn_t sixtap_8x4_c = vp8_sixtap_predict8x4_c; -const sixtap_predict_fn_t sixtap_4x4_c = vp8_sixtap_predict4x4_c; +const SixtapPredictFunc sixtap_16x16_c = vp8_sixtap_predict16x16_c; +const SixtapPredictFunc sixtap_8x8_c = vp8_sixtap_predict8x8_c; +const SixtapPredictFunc sixtap_8x4_c = vp8_sixtap_predict8x4_c; +const SixtapPredictFunc sixtap_4x4_c = vp8_sixtap_predict4x4_c; INSTANTIATE_TEST_CASE_P( C, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_c), @@ -195,9 +195,9 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_c), make_tuple(4, 4, sixtap_4x4_c))); #if HAVE_NEON -const sixtap_predict_fn_t sixtap_16x16_neon = vp8_sixtap_predict16x16_neon; -const sixtap_predict_fn_t sixtap_8x8_neon = vp8_sixtap_predict8x8_neon; -const sixtap_predict_fn_t sixtap_8x4_neon = vp8_sixtap_predict8x4_neon; +const SixtapPredictFunc sixtap_16x16_neon = vp8_sixtap_predict16x16_neon; +const SixtapPredictFunc sixtap_8x8_neon = vp8_sixtap_predict8x8_neon; +const SixtapPredictFunc sixtap_8x4_neon = vp8_sixtap_predict8x4_neon; INSTANTIATE_TEST_CASE_P( DISABLED_NEON, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_neon), @@ -205,10 +205,10 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_neon))); #endif #if HAVE_MMX -const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx; -const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx; -const sixtap_predict_fn_t sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx; -const sixtap_predict_fn_t sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx; +const SixtapPredictFunc sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx; +const SixtapPredictFunc sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx; +const SixtapPredictFunc sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx; +const SixtapPredictFunc sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx; INSTANTIATE_TEST_CASE_P( MMX, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_mmx), @@ -217,9 +217,9 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, sixtap_4x4_mmx))); #endif #if HAVE_SSE2 -const sixtap_predict_fn_t sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2; -const sixtap_predict_fn_t sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2; -const sixtap_predict_fn_t sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2; +const SixtapPredictFunc sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2; +const SixtapPredictFunc sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2; +const SixtapPredictFunc sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2; INSTANTIATE_TEST_CASE_P( SSE2, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_sse2), @@ -227,10 +227,10 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, sixtap_8x4_sse2))); #endif #if HAVE_SSSE3 -const sixtap_predict_fn_t sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3; -const sixtap_predict_fn_t sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3; -const sixtap_predict_fn_t sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3; -const sixtap_predict_fn_t sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3; +const SixtapPredictFunc sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3; +const SixtapPredictFunc sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3; +const SixtapPredictFunc sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3; +const SixtapPredictFunc sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3; 
INSTANTIATE_TEST_CASE_P( SSSE3, SixtapPredictTest, ::testing::Values( make_tuple(16, 16, sixtap_16x16_ssse3), diff --git a/test/subtract_test.cc b/test/subtract_test.cc index 2db3dd785..6619fb158 100644 --- a/test/subtract_test.cc +++ b/test/subtract_test.cc @@ -18,11 +18,11 @@ #include "vp8/encoder/block.h" #include "vpx_mem/vpx_mem.h" -typedef void (*subtract_b_fn_t)(BLOCK *be, BLOCKD *bd, int pitch); +typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch); namespace { -class SubtractBlockTest : public ::testing::TestWithParam { +class SubtractBlockTest : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 74f7842d4..ee6289f1e 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -669,3 +669,15 @@ c01bb7938f9a9f25e0c37afdec2f2fb73b6cc7fa vp90-2-17-show-existing-frame.webm cc75f351818b9a619818f5cc77b9bc013d0c1e11 vp90-2-17-show-existing-frame.webm.md5 0321d507ce62dedc8a51b4e9011f7a19aed9c3dc vp91-2-04-yuv444.webm 367e423dd41fdb49aa028574a2cfec5c2f325c5c vp91-2-04-yuv444.webm.md5 +eb438c6540eb429f74404eedfa3228d409c57874 desktop_640_360_30.yuv +89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab kirland_640_480_30.yuv +33c533192759e5bb4f07abfbac389dc259db4686 macmarcomoving_640_480_30.yuv +8bfaab121080821b8f03b23467911e59ec59b8fe macmarcostationary_640_480_30.yuv +70894878d916a599842d9ad0dcd24e10c13e5467 niklas_640_480_30.yuv +8784b6df2d8cc946195a90ac00540500d2e522e4 tacomanarrows_640_480_30.yuv +edd86a1f5e62fd9da9a9d46078247759c2638009 tacomasmallcameramovement_640_480_30.yuv +9a70e8b7d14fba9234d0e51dce876635413ce444 thaloundeskmtg_640_480_30.yuv +e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv +c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf +c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5 +717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m diff --git a/test/test.mk b/test/test.mk index 888a4bbc7..44cff1455 100644 --- a/test/test.mk +++ b/test/test.mk @@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h endif +LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += invalid_file_test.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += test_vector_test.cc @@ -69,6 +70,11 @@ ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER)$(CONFIG_WEBM_IO), \ LIBVPX_TEST_SRCS-yes += decode_perf_test.cc endif +# encode perf tests are vp9 only +ifeq ($(CONFIG_ENCODE_PERF_TESTS)$(CONFIG_VP9_ENCODER), yesyes) +LIBVPX_TEST_SRCS-yes += encode_perf_test.cc +endif + ## ## WHITE BOX TESTS ## @@ -95,6 +101,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += intrapred_test.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc endif # VP8 @@ -778,6 +785,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-16-intra-only.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-17-show-existing-frame.webm.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-18-resize.ivf.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yuv444.webm.md5 @@ -840,3 
+849,15 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \ vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm endif # CONFIG_DECODE_PERF_TESTS + +ifeq ($(CONFIG_ENCODE_PERF_TESTS),yes) +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += desktop_640_360_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += kirland_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcomoving_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += macmarcostationary_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomanarrows_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += tacomasmallcameramovement_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += thaloundeskmtg_640_480_30.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_1280_720_30.yuv +endif # CONFIG_ENCODE_PERF_TESTS diff --git a/test/test_vectors.cc b/test/test_vectors.cc index 4ea4b9dab..dbdbdd6f9 100644 --- a/test/test_vectors.cc +++ b/test/test_vectors.cc @@ -181,7 +181,7 @@ const char *const kVP9TestVectors[] = { "vp90-2-14-resize-fp-tiles-8-2.webm", "vp90-2-14-resize-fp-tiles-8-4.webm", "vp90-2-15-segkey.webm", "vp90-2-15-segkey_adpq.webm", "vp90-2-16-intra-only.webm", "vp90-2-17-show-existing-frame.webm", - "vp91-2-04-yuv444.webm", + "vp90-2-18-resize.ivf", "vp91-2-04-yuv444.webm", }; const int kNumVP9TestVectors = NELEMENTS(kVP9TestVectors); #endif // CONFIG_VP9_DECODER diff --git a/test/tools_common.sh b/test/tools_common.sh index 3e69c3687..0bfefba46 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -182,97 +182,6 @@ webm_io_available() { [ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes } -# Echoes yes to stdout when vpxdec exists according to vpx_tool_available(). -vpxdec_available() { - [ -n $(vpx_tool_available vpxdec) ] && echo yes -} - -# Wrapper function for running vpxdec in noblit mode. Requires that -# LIBVPX_BIN_PATH points to the directory containing vpxdec. Positional -# parameter one is used as the input file path. Positional parameter two, when -# present, is interpreted as a boolean flag that means the input should be sent -# to vpxdec via pipe from cat instead of directly. -vpxdec() { - local input="${1}" - local pipe_input=${2} - - if [ $# -gt 2 ]; then - # shift away $1 and $2 so the remaining arguments can be passed to vpxdec - # via $@. - shift 2 - fi - - local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}" - - if [ -z "${pipe_input}" ]; then - eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" --summary --noblit "$@" \ - ${devnull} - else - cat "${input}" \ - | eval "${VPX_TEST_PREFIX}" "${decoder}" - --summary --noblit "$@" \ - ${devnull} - fi -} - -# Echoes yes to stdout when vpxenc exists according to vpx_tool_available(). -vpxenc_available() { - [ -n $(vpx_tool_available vpxenc) ] && echo yes -} - -# Wrapper function for running vpxenc. Positional parameters are interpreted as -# follows: -# 1 - codec name -# 2 - input width -# 3 - input height -# 4 - number of frames to encode -# 5 - path to input file -# 6 - path to output file -# Note: The output file path must end in .ivf to output an IVF file. -# 7 - extra flags -# Note: Extra flags currently supports a special case: when set to "-" -# input is piped to vpxenc via cat. 
-vpxenc() { - local encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}" - local codec="${1}" - local width=${2} - local height=${3} - local frames=${4} - local input=${5} - local output="${VPX_TEST_OUTPUT_DIR}/${6}" - local extra_flags=${7} - - # Because --ivf must be within the command line to get IVF from vpxenc. - if echo "${output}" | egrep -q 'ivf$'; then - use_ivf=--ivf - else - unset use_ivf - fi - - if [ "${extra_flags}" = "-" ]; then - pipe_input=yes - extra_flags=${8} - else - unset pipe_input - fi - - if [ -z "${pipe_input}" ]; then - eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} --width=${width} \ - --height=${height} --limit=${frames} ${use_ivf} ${extra_flags} \ - --output="${output}" "${input}" ${devnull} - else - cat "${input}" \ - | eval "${VPX_TEST_PREFIX}" "${encoder}" --codec=${codec} \ - --width=${width} --height=${height} --limit=${frames} ${use_ivf} \ - ${extra_flags} --output="${output}" - ${devnull} - fi - - if [ ! -e "${output}" ]; then - # Return non-zero exit status: output file doesn't exist, so something - # definitely went wrong. - return 1 - fi -} - # Filters strings from positional parameter one using the filter specified by # positional parameter two. Filter behavior depends on the presence of a third # positional parameter. When parameter three is present, strings that match the diff --git a/test/variance_test.cc b/test/variance_test.cc index 546977069..7d8118235 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -90,14 +90,14 @@ class VarianceTest rnd(ACMRandom::DeterministicSeed()); block_size_ = width_ * height_; - src_ = new uint8_t[block_size_]; + src_ = reinterpret_cast(vpx_memalign(16, block_size_)); ref_ = new uint8_t[block_size_]; ASSERT_TRUE(src_ != NULL); ASSERT_TRUE(ref_ != NULL); } virtual void TearDown() { - delete[] src_; + vpx_free(src_); delete[] ref_; libvpx_test::ClearSystemState(); } @@ -707,24 +707,7 @@ INSTANTIATE_TEST_CASE_P( #endif #if HAVE_AVX2 -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. 
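Two changes meet here: just below, variance_test.cc drops stale AVX2 prototypes, but note first the allocation change above, where src_ moves from operator new[] to vpx_memalign(16, ...). SIMD variance kernels may issue aligned 16-byte loads (SSE2 movdqa, NEON vld1q), and a new[] buffer gives no alignment guarantee. A minimal sketch of the pairing rule, illustrative and not part of the patch:

```cpp
#include "vpx_mem/vpx_mem.h"

// Memory from vpx_memalign() must be released with vpx_free(); pairing
// it with delete[] or free() is undefined behavior.
unsigned char *AllocAlignedBlock(int block_size) {
  return reinterpret_cast<unsigned char *>(vpx_memalign(16, block_size));
}

void ReleaseAlignedBlock(unsigned char *block) {
  vpx_free(block);
}
```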
-extern "C" { -unsigned int vp9_sub_pixel_variance32x32_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_variance64x64_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vp9_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, - const uint8_t *second_pred); -unsigned int vp9_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, - const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, - const uint8_t *second_pred); -} + const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2; const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2; const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2; @@ -743,7 +726,7 @@ const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 = const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 = vp9_sub_pixel_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, VP9SubpelVarianceTest, + AVX2, VP9SubpelVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2), make_tuple(6, 6, subpel_variance64x64_avx2))); @@ -752,10 +735,32 @@ const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 = const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 = vp9_sub_pixel_avg_variance64x64_avx2; INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, VP9SubpelAvgVarianceTest, + AVX2, VP9SubpelAvgVarianceTest, ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2), make_tuple(6, 6, subpel_avg_variance64x64_avx2))); #endif // HAVE_AVX2 +#if HAVE_NEON +const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon; +const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; +const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9VarianceTest, + ::testing::Values(make_tuple(3, 3, variance8x8_neon), + make_tuple(4, 4, variance16x16_neon), + make_tuple(5, 5, variance32x32_neon))); + +const vp9_subpixvariance_fn_t subpel_variance8x8_neon = + vp9_sub_pixel_variance8x8_neon; +const vp9_subpixvariance_fn_t subpel_variance16x16_neon = + vp9_sub_pixel_variance16x16_neon; +const vp9_subpixvariance_fn_t subpel_variance32x32_neon = + vp9_sub_pixel_variance32x32_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon), + make_tuple(4, 4, subpel_variance16x16_neon), + make_tuple(5, 5, subpel_variance32x32_neon))); +#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/test/video_source.h b/test/video_source.h index 94500b43d..c924f964f 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -10,6 +10,9 @@ #ifndef TEST_VIDEO_SOURCE_H_ #define TEST_VIDEO_SOURCE_H_ +#if defined(_WIN32) +#include +#endif #include #include #include @@ -50,14 +53,57 @@ static FILE *OpenTestDataFile(const std::string& file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *OpenTestOutFile(const std::string& file_name) { - const std::string path_to_source = GetDataPath() + "/" + file_name; - return fopen(path_to_source.c_str(), "wb"); +static FILE *GetTempOutFile(std::string *file_name) { + file_name->clear(); +#if defined(_WIN32) + char fname[MAX_PATH]; + char tmppath[MAX_PATH]; + if 
(GetTempPathA(MAX_PATH, tmppath)) { + // Assume for now that the filename generated is unique per process + if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { + file_name->assign(fname); + return fopen(fname, "wb+"); + } + } + return NULL; +#else + return tmpfile(); +#endif } -static FILE *OpenTempOutFile() { - return tmpfile(); -} +class TempOutFile { + public: + TempOutFile() { + file_ = GetTempOutFile(&file_name_); + } + ~TempOutFile() { + CloseFile(); + if (!file_name_.empty()) { + EXPECT_EQ(0, remove(file_name_.c_str())); + } + } + FILE *file() { + return file_; + } + const std::string& file_name() { + return file_name_; + } + + protected: + void CloseFile() { + if (file_) { + // Close if file pointer is associated with an open file +#if defined(_WIN32) + if (file_->_ptr != NULL) fclose(file_); +#else + if (fileno(file_) != -1) fclose(file_); +#endif + file_ = NULL; + } + } + FILE *file_; + std::string file_name_; +}; // Abstract base class for test video sources, which provide a stream of // vpx_image_t images with associated timestamps and duration. diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh index a5728f677..6dd5f171b 100755 --- a/test/vp9_spatial_svc_encoder.sh +++ b/test/vp9_spatial_svc_encoder.sh @@ -25,12 +25,13 @@ vp9_spatial_svc_encoder_verify_environment() { # Runs vp9_spatial_svc_encoder. $1 is the test name. vp9_spatial_svc_encoder() { - local encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder" - encoder="${encoder}${VPX_TEST_EXE_SUFFIX}" - local test_name="$1" - local output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local frames_to_encode="10" - local max_kf="9999" + local readonly \ + encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" + local readonly test_name="$1" + local readonly \ + output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" + local readonly frames_to_encode=10 + local readonly max_kf=9999 shift @@ -40,52 +41,32 @@ vp9_spatial_svc_encoder() { fi eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} + -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ + "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} [ -e "${output_file}" ] || return 1 } -# Each mode is run with layer count 1-$vp9_ssvc_test_layers. +# Each test is run with layer count 1-$vp9_ssvc_test_layers. 
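The TempOutFile class introduced in video_source.h above hides a platform split: POSIX tmpfile() returns an anonymous FILE* that the OS deletes automatically on close, while Windows needs a named file from GetTempFileNameA that must be closed and removed explicitly, which the destructor handles. An illustrative usage sketch, not part of the patch:

```cpp
#include <cstdio>
#include "test/video_source.h"

void WriteScratch() {
  libvpx_test::TempOutFile tmp;       // opens a writable temp file
  if (tmp.file() == NULL) return;     // creation can fail
  fputs("scratch bytes\n", tmp.file());
  // No explicit cleanup: ~TempOutFile() closes the FILE* and, when the
  // name is known (the Windows path), removes the file from disk.
}
```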
vp9_ssvc_test_layers=5 -vp9_spatial_svc_mode_i() { +vp9_spatial_svc() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="vp9_spatial_svc" for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m i -l ${layers} + vp9_spatial_svc_encoder "${test_name}" -l ${layers} done fi } -vp9_spatial_svc_mode_altip() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m "alt-ip" -l ${layers} - done - fi -} +readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i + DISABLED_vp9_spatial_svc_mode_altip + DISABLED_vp9_spatial_svc_mode_ip + DISABLED_vp9_spatial_svc_mode_gf + vp9_spatial_svc" -vp9_spatial_svc_mode_ip() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - vp9_spatial_svc_encoder "${test_name}" -m ip -l 1 - fi -} - -vp9_spatial_svc_mode_gf() { - if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" - for layers in $(seq 1 ${vp9_ssvc_test_layers}); do - vp9_spatial_svc_encoder "${test_name}" -m gf -l ${layers} - done - fi -} - -vp9_spatial_svc_tests="vp9_spatial_svc_mode_i - vp9_spatial_svc_mode_altip - vp9_spatial_svc_mode_ip - vp9_spatial_svc_mode_gf" - -run_tests vp9_spatial_svc_encoder_verify_environment "${vp9_spatial_svc_tests}" +if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then + run_tests \ + vp9_spatial_svc_encoder_verify_environment \ + "${vp9_spatial_svc_tests}" +fi diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index d7df2867d..fabb43824 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -17,14 +17,14 @@ #include "vp9/common/vp9_blockd.h" #include "vpx_mem/vpx_mem.h" -typedef void (*subtract_fn_t)(int rows, int cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride); +typedef void (*SubtractFunc)(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride); namespace vp9 { -class VP9SubtractBlockTest : public ::testing::TestWithParam { +class VP9SubtractBlockTest : public ::testing::TestWithParam { public: virtual void TearDown() { libvpx_test::ClearSystemState(); @@ -95,4 +95,9 @@ INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vp9_subtract_block_sse2)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_neon)); +#endif + } // namespace vp9 diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc new file mode 100644 index 000000000..b3302d942 --- /dev/null +++ b/test/vpx_scale_test.cc @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/clear_system_state.h" +#include "test/register_state_check.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +namespace { + +typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); +typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, + YV12_BUFFER_CONFIG *dst_ybf); + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { + libvpx_test::ClearSystemState(); + } + + void ResetImage(int width, int height) { + width_ = width; + height_ = height; + vpx_memset(&img_, 0, sizeof(img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size); + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + + vpx_memset(&ref_img_, 0, sizeof(ref_img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); + + vpx_memset(&cpy_img_, 0, sizeof(cpy_img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); + ReferenceCopyFrame(); + } + + void DeallocImage() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&cpy_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *buf, int width, int height, int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. + uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + vpx_memset(left, left[padding], padding); + vpx_memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. 
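The replication rule that ExtendPlane implements — every border pixel takes the value of the nearest visible pixel, with the loop that follows finishing the job by copying the already-extended first and last rows upward and downward — can be seen in one dimension with a toy example (illustrative only, not part of the patch):

```cpp
#include <cstdio>
#include <cstring>

int main() {
  // A row of 4 visible pixels with 2 pixels of padding on each side.
  unsigned char row[8] = { 0, 0, 10, 20, 30, 40, 0, 0 };
  unsigned char *visible = row + 2;
  memset(visible - 2, visible[0], 2);  // left border:  10 10
  memset(visible + 4, visible[3], 2);  // right border: 40 40
  for (int i = 0; i < 8; ++i) printf("%d ", row[i]);
  // Prints: 10 10 10 20 30 40 40 40
  return 0;
}
```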
+ for (int y = 0; y < padding; ++y) { + vpx_memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + vpx_memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, + ref_img_.y_crop_width, ref_img_.y_crop_height, + ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, + ref_img_.border); + ExtendPlane(ref_img_.u_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG cpy_img_; + int width_; + int height_; +}; + +class ExtendBorderTest + : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~ExtendBorderTest() {} + + protected: + virtual void SetUp() { + extend_fn_ = GetParam(); + } + + void ExtendBorder() { + ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); + } + + void RunTest() { +#if ARCH_ARM + // Some arm devices OOM when trying to allocate the largest buffers. + static const int kNumSizesToTest = 6; +#else + static const int kNumSizesToTest = 7; +#endif + static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383}; + for (int h = 0; h < kNumSizesToTest; ++h) { + for (int w = 0; w < kNumSizesToTest; ++w) { + ResetImage(kSizesToTest[w], kSizesToTest[h]); + ExtendBorder(); + ReferenceExtendBorder(); + CompareImages(img_); + DeallocImage(); + } + } + } + + ExtendFrameBorderFunc extend_fn_; +}; + +TEST_P(ExtendBorderTest, ExtendBorder) { + ASSERT_NO_FATAL_FAILURE(RunTest()); +} + +INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest, + ::testing::Values(vp8_yv12_extend_frame_borders_c)); + +class CopyFrameTest + : public VpxScaleBase, + public ::testing::TestWithParam { + public: + virtual ~CopyFrameTest() {} + + protected: + virtual void SetUp() { + copy_frame_fn_ = GetParam(); + } + + void CopyFrame() { + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + } + + void RunTest() { +#if ARCH_ARM + // Some arm devices OOM when trying to allocate the largest buffers. 
+    static const int kNumSizesToTest = 6;
+#else
+    static const int kNumSizesToTest = 7;
+#endif
+    static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+    for (int h = 0; h < kNumSizesToTest; ++h) {
+      for (int w = 0; w < kNumSizesToTest; ++w) {
+        ResetImage(kSizesToTest[w], kSizesToTest[h]);
+        ReferenceCopyFrame();
+        CopyFrame();
+        CompareImages(cpy_img_);
+        DeallocImage();
+      }
+    }
+  }
+
+  CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+  ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+                        ::testing::Values(vp8_yv12_copy_frame_c));
+}  // namespace
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index 093230b69..836b13cce 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -22,6 +22,32 @@ vpxdec_verify_environment() {
  fi
 }
 
+# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
+vpxdec_available() {
+  [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+}
+
+# Wrapper function for running vpxdec with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxdec.
+vpxdec_pipe() {
+  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+  local input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxdec.
+vpxdec() {
+  local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+  local input="${1}"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
 vpxdec_can_decode_vp8() {
  if [ "$(vpxdec_available)" = "yes" ] && \
     [ "$(vp8_decode_available)" = "yes" ]; then
@@ -38,20 +64,20 @@ vpxdec_can_decode_vp9() {
 
 vpxdec_vp8_ivf() {
  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
-    vpxdec "${VP8_IVF_FILE}"
+    vpxdec "${VP8_IVF_FILE}" --summary --noblit
  fi
 }
 
 vpxdec_vp8_ivf_pipe_input() {
  if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
-    vpxdec "${VP8_IVF_FILE}" -
+    vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit
  fi
 }
 
 vpxdec_vp9_webm() {
  if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
     [ "$(webm_io_available)" = "yes" ]; then
-    vpxdec "${VP9_WEBM_FILE}"
+    vpxdec "${VP9_WEBM_FILE}" --summary --noblit
  fi
 }
diff --git a/test/vpxenc.sh b/test/vpxenc.sh
index f08c04878..3cf3f4d40 100755
--- a/test/vpxenc.sh
+++ b/test/vpxenc.sh
@@ -15,7 +15,7 @@
 ##
 . $(dirname $0)/tools_common.sh
 
-TEST_FRAMES=10
+readonly TEST_FRAMES=10
 
 # Environment check: Make sure input is available.
 vpxenc_verify_environment() {
@@ -39,55 +39,172 @@ vpxenc_can_encode_vp9() {
  fi
 }
 
+# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
+vpxenc_available() {
+  [ -n "$(vpx_tool_available vpxenc)" ] && echo yes
+}
+
+# Wrapper function for running vpxenc with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxenc.
+vpxenc_pipe() {
+  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly input="$1"
+  shift
+  cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxenc. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxenc. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxenc.
+vpxenc() {
+  local readonly encoder="${LIBVPX_BIN_PATH}/vpxenc${VPX_TEST_EXE_SUFFIX}"
+  local readonly input="${1}"
+  shift
+  eval "${VPX_TEST_PREFIX}" "${encoder}" "$input" "$@" ${devnull}
+}
+
 vpxenc_vp8_ivf() {
  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.ivf
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf"
+    vpxenc --codec=vp8 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
-vpxenc_vp8_ivf_pipe_input() {
+vpxenc_vp8_ivf_piped_input() {
  if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.ivf -
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf"
+    cat "${YUV_RAW_INPUT}" \
+      | vpxenc --codec=vp8 \
+        --width="${YUV_RAW_INPUT_WIDTH}" \
+        --height="${YUV_RAW_INPUT_HEIGHT}" \
+        --limit="${TEST_FRAMES}" \
+        --ivf \
+        --output="${output}" \
+        -
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp8_webm() {
-  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] &&
-     [ "$(webm_io_available)" = "yes" ] ; then
-    vpxenc vp8 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp8.webm
+  if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm"
+    vpxenc --codec=vp8 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp9_ivf() {
  if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then
-    vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp9.ivf
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf"
+    vpxenc --codec=vp9 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --ivf \
+      --test-decode=fatal \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
  fi
 }
 
 vpxenc_vp9_webm() {
-  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] &&
-     [ "$(webm_io_available)" = "yes" ] ; then
-    vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \
-      "${YUV_RAW_INPUT}" vp9.webm
+  if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \
+     [ "$(webm_io_available)" = "yes" ]; then
+    local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm"
+    vpxenc --codec=vp9 \
+      --width="${YUV_RAW_INPUT_WIDTH}" \
+      --height="${YUV_RAW_INPUT_HEIGHT}" \
+      --limit="${TEST_FRAMES}" \
+      --test-decode=fatal \
+      --output="${output}" \
+      "${YUV_RAW_INPUT}"
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+ return 1 + fi fi } -DISABLED_vpxenc_vp9_ivf_lossless() { +vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - vpxenc vp9 ${YUV_RAW_INPUT_WIDTH} ${YUV_RAW_INPUT_HEIGHT} ${TEST_FRAMES} \ - "${YUV_RAW_INPUT}" vp9_lossless.ivf --lossless + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + vpxenc --codec=vp9 \ + --width="${YUV_RAW_INPUT_WIDTH}" \ + --height="${YUV_RAW_INPUT_HEIGHT}" \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + --lossless=1 \ + --test-decode=fatal \ + "${YUV_RAW_INPUT}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +vpxenc_vp9_ivf_minq0_maxq0() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + vpxenc --codec=vp9 \ + --width="${YUV_RAW_INPUT_WIDTH}" \ + --height="${YUV_RAW_INPUT_HEIGHT}" \ + --limit="${TEST_FRAMES}" \ + --ivf \ + --output="${output}" \ + --min-q=0 \ + --max-q=0 \ + --test-decode=fatal \ + "${YUV_RAW_INPUT}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi fi } vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm - vpxenc_vp8_ivf_pipe_input + vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm - DISABLED_vpxenc_vp9_ivf_lossless" + vpxenc_vp9_ivf_lossless + vpxenc_vp9_ivf_minq0_maxq0" run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 73ff68308..d4a2ede20 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -24,14 +24,14 @@ static const unsigned int kWidth = 160; static const unsigned int kHeight = 90; static const unsigned int kFrames = 10; -typedef struct { +struct Y4mTestParam { const char *filename; unsigned int bit_depth; vpx_img_fmt format; const char *md5raw; -} test_entry_type; +}; -const test_entry_type kY4mTestVectors[] = { +const Y4mTestParam kY4mTestVectors[] = { {"park_joy_90p_8_420.y4m", 8, VPX_IMG_FMT_I420, "e5406275b9fc6bb3436c31d4a05c1cab"}, {"park_joy_90p_8_422.y4m", 8, VPX_IMG_FMT_I422, @@ -70,7 +70,7 @@ static void write_image_file(const vpx_image_t *img, FILE *file) { } class Y4mVideoSourceTest - : public ::testing::TestWithParam, + : public ::testing::TestWithParam, public ::libvpx_test::Y4mVideoSource { protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} @@ -126,7 +126,7 @@ class Y4mVideoSourceTest }; TEST_P(Y4mVideoSourceTest, SourceTest) { - const test_entry_type t = GetParam(); + const Y4mTestParam t = GetParam(); Init(t.filename, kFrames); HeaderChecks(t.bit_depth, t.format); Md5Check(t.md5raw); @@ -138,9 +138,14 @@ INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest, class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: - Y4mVideoWriteTest() : Y4mVideoSourceTest() {} + Y4mVideoWriteTest() {} - virtual void ReplaceInputFp(FILE *input_file) { + virtual ~Y4mVideoWriteTest() { + CloseSource(); + delete tmpfile_; + } + + virtual void ReplaceInputFile(FILE *input_file) { CloseSource(); frame_ = 0; input_file_ = input_file; @@ -153,30 +158,31 @@ class Y4mVideoWriteTest ASSERT_TRUE(input_file_ != NULL); char buf[Y4M_BUFFER_SIZE] = {0}; const struct VpxRational framerate = {y4m_.fps_n, y4m_.fps_d}; - FILE *out_file = libvpx_test::OpenTempOutFile(); - ASSERT_TRUE(out_file != NULL); + tmpfile_ = new libvpx_test::TempOutFile; + ASSERT_TRUE(tmpfile_->file() != NULL); y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate, y4m_.vpx_fmt, y4m_.bit_depth); - fputs(buf, out_file); + fputs(buf, tmpfile_->file()); for (unsigned int i = 
start_; i < limit_; i++) { y4m_write_frame_header(buf, sizeof(buf)); - fputs(buf, out_file); - write_image_file(img(), out_file); + fputs(buf, tmpfile_->file()); + write_image_file(img(), tmpfile_->file()); Next(); } - ReplaceInputFp(out_file); + ReplaceInputFile(tmpfile_->file()); } virtual void Init(const std::string &file_name, int limit) { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } + libvpx_test::TempOutFile *tmpfile_; }; TEST_P(Y4mVideoWriteTest, WriteTest) { - const test_entry_type t = GetParam(); + const Y4mTestParam t = GetParam(); Init(t.filename, kFrames); HeaderChecks(t.bit_depth, t.format); Md5Check(t.md5raw); diff --git a/third_party/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h deleted file mode 100644 index 44d438583..000000000 --- a/third_party/libmkv/EbmlIDs.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef MKV_DEFS_HPP -#define MKV_DEFS_HPP 1 - -/* Commenting out values not available in webm, but available in matroska */ - -enum mkv { - EBML = 0x1A45DFA3, - EBMLVersion = 0x4286, - EBMLReadVersion = 0x42F7, - EBMLMaxIDLength = 0x42F2, - EBMLMaxSizeLength = 0x42F3, - DocType = 0x4282, - DocTypeVersion = 0x4287, - DocTypeReadVersion = 0x4285, -/* CRC_32 = 0xBF, */ - Void = 0xEC, - SignatureSlot = 0x1B538667, - SignatureAlgo = 0x7E8A, - SignatureHash = 0x7E9A, - SignaturePublicKey = 0x7EA5, - Signature = 0x7EB5, - SignatureElements = 0x7E5B, - SignatureElementList = 0x7E7B, - SignedElement = 0x6532, - /* segment */ - Segment = 0x18538067, - /* Meta Seek Information */ - SeekHead = 0x114D9B74, - Seek = 0x4DBB, - SeekID = 0x53AB, - SeekPosition = 0x53AC, - /* Segment Information */ - Info = 0x1549A966, -/* SegmentUID = 0x73A4, */ -/* SegmentFilename = 0x7384, */ -/* PrevUID = 0x3CB923, */ -/* PrevFilename = 0x3C83AB, */ -/* NextUID = 0x3EB923, */ -/* NextFilename = 0x3E83BB, */ -/* SegmentFamily = 0x4444, */ -/* ChapterTranslate = 0x6924, */ -/* ChapterTranslateEditionUID = 0x69FC, */ -/* ChapterTranslateCodec = 0x69BF, */ -/* ChapterTranslateID = 0x69A5, */ - TimecodeScale = 0x2AD7B1, - Segment_Duration = 0x4489, - DateUTC = 0x4461, -/* Title = 0x7BA9, */ - MuxingApp = 0x4D80, - WritingApp = 0x5741, - /* Cluster */ - Cluster = 0x1F43B675, - Timecode = 0xE7, -/* SilentTracks = 0x5854, */ -/* SilentTrackNumber = 0x58D7, */ -/* Position = 0xA7, */ - PrevSize = 0xAB, - BlockGroup = 0xA0, - Block = 0xA1, -/* BlockVirtual = 0xA2, */ - BlockAdditions = 0x75A1, - BlockMore = 0xA6, - BlockAddID = 0xEE, - BlockAdditional = 0xA5, - BlockDuration = 0x9B, -/* ReferencePriority = 0xFA, */ - ReferenceBlock = 0xFB, -/* ReferenceVirtual = 0xFD, */ -/* CodecState = 0xA4, */ -/* Slices = 0x8E, */ -/* TimeSlice = 0xE8, */ - LaceNumber = 0xCC, -/* FrameNumber = 0xCD, */ -/* BlockAdditionID = 0xCB, */ -/* MkvDelay = 0xCE, */ -/* Cluster_Duration = 0xCF, */ - SimpleBlock = 0xA3, -/* EncryptedBlock = 0xAF, */ - /* Track */ - Tracks = 0x1654AE6B, - TrackEntry = 0xAE, - TrackNumber = 0xD7, - TrackUID = 0x73C5, - TrackType = 0x83, - FlagEnabled = 0xB9, - FlagDefault = 0x88, - FlagForced = 0x55AA, - FlagLacing = 0x9C, -/* MinCache = 0x6DE7, */ -/* MaxCache = 0x6DF8, */ - 
DefaultDuration = 0x23E383, -/* TrackTimecodeScale = 0x23314F, */ -/* TrackOffset = 0x537F, */ - MaxBlockAdditionID = 0x55EE, - Name = 0x536E, - Language = 0x22B59C, - CodecID = 0x86, - CodecPrivate = 0x63A2, - CodecName = 0x258688, -/* AttachmentLink = 0x7446, */ -/* CodecSettings = 0x3A9697, */ -/* CodecInfoURL = 0x3B4040, */ -/* CodecDownloadURL = 0x26B240, */ -/* CodecDecodeAll = 0xAA, */ -/* TrackOverlay = 0x6FAB, */ -/* TrackTranslate = 0x6624, */ -/* TrackTranslateEditionUID = 0x66FC, */ -/* TrackTranslateCodec = 0x66BF, */ -/* TrackTranslateTrackID = 0x66A5, */ - /* video */ - Video = 0xE0, - FlagInterlaced = 0x9A, - StereoMode = 0x53B8, - AlphaMode = 0x53C0, - PixelWidth = 0xB0, - PixelHeight = 0xBA, - PixelCropBottom = 0x54AA, - PixelCropTop = 0x54BB, - PixelCropLeft = 0x54CC, - PixelCropRight = 0x54DD, - DisplayWidth = 0x54B0, - DisplayHeight = 0x54BA, - DisplayUnit = 0x54B2, - AspectRatioType = 0x54B3, -/* ColourSpace = 0x2EB524, */ -/* GammaValue = 0x2FB523, */ - FrameRate = 0x2383E3, - /* end video */ - /* audio */ - Audio = 0xE1, - SamplingFrequency = 0xB5, - OutputSamplingFrequency = 0x78B5, - Channels = 0x9F, -/* ChannelPositions = 0x7D7B, */ - BitDepth = 0x6264, - /* end audio */ - /* content encoding */ -/* ContentEncodings = 0x6d80, */ -/* ContentEncoding = 0x6240, */ -/* ContentEncodingOrder = 0x5031, */ -/* ContentEncodingScope = 0x5032, */ -/* ContentEncodingType = 0x5033, */ -/* ContentCompression = 0x5034, */ -/* ContentCompAlgo = 0x4254, */ -/* ContentCompSettings = 0x4255, */ -/* ContentEncryption = 0x5035, */ -/* ContentEncAlgo = 0x47e1, */ -/* ContentEncKeyID = 0x47e2, */ -/* ContentSignature = 0x47e3, */ -/* ContentSigKeyID = 0x47e4, */ -/* ContentSigAlgo = 0x47e5, */ -/* ContentSigHashAlgo = 0x47e6, */ - /* end content encoding */ - /* Cueing Data */ - Cues = 0x1C53BB6B, - CuePoint = 0xBB, - CueTime = 0xB3, - CueTrackPositions = 0xB7, - CueTrack = 0xF7, - CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378 -/* CueCodecState = 0xEA, */ -/* CueReference = 0xDB, */ -/* CueRefTime = 0x96, */ -/* CueRefCluster = 0x97, */ -/* CueRefNumber = 0x535F, */ -/* CueRefCodecState = 0xEB, */ - /* Attachment */ -/* Attachments = 0x1941A469, */ -/* AttachedFile = 0x61A7, */ -/* FileDescription = 0x467E, */ -/* FileName = 0x466E, */ -/* FileMimeType = 0x4660, */ -/* FileData = 0x465C, */ -/* FileUID = 0x46AE, */ -/* FileReferral = 0x4675, */ - /* Chapters */ -/* Chapters = 0x1043A770, */ -/* EditionEntry = 0x45B9, */ -/* EditionUID = 0x45BC, */ -/* EditionFlagHidden = 0x45BD, */ -/* EditionFlagDefault = 0x45DB, */ -/* EditionFlagOrdered = 0x45DD, */ -/* ChapterAtom = 0xB6, */ -/* ChapterUID = 0x73C4, */ -/* ChapterTimeStart = 0x91, */ -/* ChapterTimeEnd = 0x92, */ -/* ChapterFlagHidden = 0x98, */ -/* ChapterFlagEnabled = 0x4598, */ -/* ChapterSegmentUID = 0x6E67, */ -/* ChapterSegmentEditionUID = 0x6EBC, */ -/* ChapterPhysicalEquiv = 0x63C3, */ -/* ChapterTrack = 0x8F, */ -/* ChapterTrackNumber = 0x89, */ -/* ChapterDisplay = 0x80, */ -/* ChapString = 0x85, */ -/* ChapLanguage = 0x437C, */ -/* ChapCountry = 0x437E, */ -/* ChapProcess = 0x6944, */ -/* ChapProcessCodecID = 0x6955, */ -/* ChapProcessPrivate = 0x450D, */ -/* ChapProcessCommand = 0x6911, */ -/* ChapProcessTime = 0x6922, */ -/* ChapProcessData = 0x6933, */ - /* Tagging */ -/* Tags = 0x1254C367, */ -/* Tag = 0x7373, */ -/* Targets = 0x63C0, */ -/* TargetTypeValue = 0x68CA, */ -/* TargetType = 0x63CA, */ -/* Tagging_TrackUID = 0x63C5, */ -/* Tagging_EditionUID = 0x63C9, */ -/* Tagging_ChapterUID = 0x63C4, */ -/* 
AttachmentUID = 0x63C6, */ -/* SimpleTag = 0x67C8, */ -/* TagName = 0x45A3, */ -/* TagLanguage = 0x447A, */ -/* TagDefault = 0x4484, */ -/* TagString = 0x4487, */ -/* TagBinary = 0x4485, */ -}; -#endif diff --git a/third_party/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c deleted file mode 100644 index 27cfe861c..000000000 --- a/third_party/libmkv/EbmlWriter.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "EbmlWriter.h" -#include <stdlib.h> -#include <wchar.h> -#include <string.h> -#include <limits.h> -#if defined(_MSC_VER) -#define LITERALU64(n) n -#else -#define LITERALU64(n) n##LLU -#endif - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) { - /* TODO check and make sure we are not > than 0x0100000000000000LLU */ - unsigned char size = 8; /* size in bytes to output */ - - /* mask to compare for byte size */ - int64_t minVal = 0xff; - - for (size = 1; size < 8; size ++) { - if (val < minVal) - break; - - minVal = (minVal << 7); - } - - val |= (((uint64_t)0x80) << ((size - 1) * 7)); - - Ebml_Serialize(glob, (void *) &val, sizeof(val), size); - } - -void Ebml_WriteString(EbmlGlobal *glob, const char *str) { - const size_t size_ = strlen(str); - const uint64_t size = size_; - Ebml_WriteLen(glob, size); - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we omit the null terminator. - */ - Ebml_Write(glob, str, (unsigned long)size); -} - -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) { - const size_t strlen = wcslen(wstr); - - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we include it.
- */ - const uint64_t size = strlen; - - Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, (unsigned long)size); -} - -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) { - int len; - - if (class_id >= 0x01000000) - len = 4; - else if (class_id >= 0x00010000) - len = 3; - else if (class_id >= 0x00000100) - len = 2; - else - len = 1; - - Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len); -} - -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) { - unsigned char sizeSerialized = 8 | 0x80; - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), 8); -} - -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) { - unsigned char size = 8; /* size in bytes to output */ - unsigned char sizeSerialized = 0; - unsigned long minVal; - - Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; /* mask to compare for byte size */ - - for (size = 1; size < 4; size ++) { - if (ui < minVal) { - break; - } - - minVal <<= 7; - } - - sizeSerialized = 0x80 | size; - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), size); -} -/* TODO: perhaps this is a poor name for this id serializer helper function */ -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) { - int size; - for (size = 4; size > 1; size--) { - if (bin & (unsigned int)0x000000ff << ((size - 1) * 8)) - break; - } - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, size); - Ebml_WriteID(glob, bin); -} - -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) { - unsigned char len = 0x88; - - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &len, sizeof(len), 1); - Ebml_Serialize(glob, &d, sizeof(d), 8); -} - -void Ebml_WriteSigned16(EbmlGlobal *glob, short val) { - signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8; - Ebml_Serialize(glob, &out, sizeof(out), 3); -} - -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteString(glob, s); -} - -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteUTF8(glob, s); -} - -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) { - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, data_length); - Ebml_Write(glob, data, data_length); -} - -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) { - unsigned char tmp = 0; - unsigned long i = 0; - - Ebml_WriteID(glob, 0xEC); - Ebml_WriteLen(glob, vSize); - - for (i = 0; i < vSize; i++) { - Ebml_Write(glob, &tmp, 1); - } -} - -/* TODO Serialize Date */ diff --git a/third_party/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h deleted file mode 100644 index b94f75733..000000000 --- a/third_party/libmkv/EbmlWriter.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ -#ifndef EBMLWRITER_HPP -#define EBMLWRITER_HPP -#include <stddef.h> -#include "vpx/vpx_integer.h" - -/* note: you must define write and serialize functions as well as your own - * EBML_GLOBAL - * - * These functions MUST be implemented - */ - -typedef struct EbmlGlobal EbmlGlobal; -void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long); -void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long); - -/*****/ - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val); -void Ebml_WriteString(EbmlGlobal *glob, const char *str); -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr); -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id); -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui); -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d); -/* TODO make this more generic to signed */ -void Ebml_WriteSigned16(EbmlGlobal *glob, short val); -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s); -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s); -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length); -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize); -/* TODO need date function */ -#endif diff --git a/third_party/libwebm/PATENTS.TXT b/third_party/libwebm/PATENTS.TXT index 4414d8385..79d17d7d6 100644 --- a/third_party/libwebm/PATENTS.TXT +++ b/third_party/libwebm/PATENTS.TXT @@ -1,22 +1,23 @@ Additional IP Rights Grant (Patents) +------------------------------------ -"This implementation" means the copyrightable works distributed by -Google as part of the WebM Project. +"These implementations" means the copyrightable works that implement the WebM +codecs distributed by Google as part of the WebM Project. -Google hereby grants to you a perpetual, worldwide, non-exclusive, -no-charge, royalty-free, irrevocable (except as stated in this section) -patent license to make, have made, use, offer to sell, sell, import, -transfer, and otherwise run, modify and propagate the contents of this -implementation of VP8, where such license applies only to those patent -claims, both currently owned by Google and acquired in the future, -licensable by Google that are necessarily infringed by this -implementation of VP8. This grant does not include claims that would be -infringed only as a consequence of further modification of this -implementation. If you or your agent or exclusive licensee institute or -order or agree to the institution of patent litigation against any -entity (including a cross-claim or counterclaim in a lawsuit) alleging -that this implementation of VP8 or any code incorporated within this -implementation of VP8 constitutes direct or contributory patent -infringement, or inducement of patent infringement, then any patent -rights granted to you under this License for this implementation of VP8 -shall terminate as of the date such litigation is filed.
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitutes direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed. diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index dd368dcbd..fa5b498ca 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1005 +Version: 1041 License: BSD License File: LICENSE @@ -13,5 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: -Modified the original scaler code minimally with include file changes to fit -in our current build system. \ No newline at end of file +None. diff --git a/third_party/libyuv/include/libyuv/compare.h b/third_party/libyuv/include/libyuv/compare.h new file mode 100644 index 000000000..5dfac7c86 --- /dev/null +++ b/third_party/libyuv/include/libyuv/compare.h @@ -0,0 +1,73 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Sum Square Error - used to compute Mean Square Error or PSNR. 
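+// (Editor's note, not upstream text: given the sum square error over `count`
+// samples, the usual derivation is
+//   mse  = sse / count
+//   psnr = 10.0 * log10(255.0 * 255.0 / mse)
+// capped at the kMaxPsnr constant declared below when sse is zero;
+// SumSquareErrorToPsnr() below is expected to follow this formula.)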
+LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h new file mode 100644 index 000000000..1bd45c837 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert.h @@ -0,0 +1,254 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I420 to I420. 
+#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert Q420 to I420. +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. 
+LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height); +#endif + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
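+//
+// Illustrative sketch (editor's addition; `sample`, the dst planes, `width`
+// and `height` are hypothetical caller variables, and FOURCC_YUY2 is assumed
+// to come from video_common.h): converting a packed YUY2 capture to I420
+// with no crop and no rotation would look roughly like
+//   ConvertToI420(sample, sample_size,
+//                 dst_y, width,              // luma plane, stride == width
+//                 dst_u, (width + 1) / 2,    // chroma planes, half stride
+//                 dst_v, (width + 1) / 2,
+//                 0, 0,                      // crop_x, crop_y
+//                 width, height,             // source dimensions
+//                 width, height,             // crop == full frame
+//                 libyuv::kRotate0, libyuv::FOURCC_YUY2);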
+LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 000000000..a18014ca2 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,225 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// Add missing Q420. +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB_Reference + +// Convert I400 to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. 
+LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// TODO(fbarchard): Convert Q420 to ARGB. +// LIBYUV_API +// int Q420ToARGB(const uint8* src_y, int src_stride_y, +// const uint8* src_yuy2, int src_stride_yuy2, +// uint8* dst_argb, int dst_stride_argb, +// int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. 
+// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h new file mode 100644 index 000000000..b1cf57f7d --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,173 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
+LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToM420 +// TODO(fbarchard): I420ToQ420 + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Note Bayer formats (BGGR) To I420 are in format_conversion.h. + +// Convert I420 to specified format. +// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. 
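+// For example (editor's sketch; `y`/`u`/`v`, their strides, `dst`, `width`
+// and `height` are hypothetical, and FOURCC_YUY2 is assumed from
+// video_common.h), emitting packed YUY2 into a contiguous buffer would be
+//   ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
+//                   dst, 0,                  // 0: rows are contiguous
+//                   width, height, libyuv::FOURCC_YUY2);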
+LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 000000000..90f43af04 --- /dev/null +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,166 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). 
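+// (Editor's note: the 'J' variants assume full-range 0-255 JPEG levels,
+// whereas the plain I420 conversions above use the 16-235 studio swing.)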
+LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h index fd6276b49..dc858a814 100644 --- a/third_party/libyuv/include/libyuv/cpu_id.h +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT #define INCLUDE_LIBYUV_CPU_ID_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/format_conversion.h b/third_party/libyuv/include/libyuv/format_conversion.h new file mode 100644 index 000000000..b18bf0534 --- /dev/null +++ b/third_party/libyuv/include/libyuv/format_conversion.h @@ -0,0 +1,168 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert Bayer RGB formats to I420.
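+// (Editor's note: the four-letter suffixes below name the 2x2 Bayer mosaic
+// read left-to-right, top-to-bottom; BGGR, for instance, is a Blue/Green
+// top row over a Green/Red bottom row.)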
+LIBYUV_API +int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ + BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) + +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer); + +// Convert I420 to Bayer RGB formats. +LIBYUV_API +int I420ToBayerBGGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGBRG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerGRBG(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Temporary API mapper. +#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ + I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) + +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height, + uint32 dst_fourcc_bayer); + +// Convert Bayer RGB formats to ARGB. +LIBYUV_API +int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Temporary API mapper. +#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) + +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer); + +// Converts ARGB to Bayer RGB formats. 
+LIBYUV_API +int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +LIBYUV_API +int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height); + +// Temporary API mapper. +#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) + +LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 000000000..82fd95df2 --- /dev/null +++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,193 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. 
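+  // (Editor's sketch of the typical call order, assuming only the API in
+  // this header; `src`, `src_len` and `planes` are hypothetical:
+  //    MJpegDecoder decoder;
+  //    if (decoder.LoadFrame(src, src_len)) {
+  //      int w = decoder.GetWidth();
+  //      int h = decoder.GetHeight();
+  //      // allocate GetNumComponents() buffers sized via GetComponentSize()
+  //      decoder.DecodeToBuffers(planes, w, h);
+  //    }
+  // UnloadFrame() may be called instead if decoding is not wanted.)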
+ // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. 
+ uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index 43f8df36d..d10a16985 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -11,11 +11,11 @@ #ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" // TODO(fbarchard): Remove the following headers includes. -// #include "convert.h" -// #include "convert_argb.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/rotate.h b/third_party/libyuv/include/libyuv/rotate.h new file mode 100644 index 000000000..8af60b895 --- /dev/null +++ b/third_party/libyuv/include/libyuv/rotate.h @@ -0,0 +1,117 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int src_width, int src_height, enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. 
+// These functions take one input pointer and +// split the data into two buffers while +// rotating them. Deprecated. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/rotate_argb.h b/third_party/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 000000000..660ff5573 --- /dev/null +++ b/third_party/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,33 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index daf5a45e1..fdfe1ae35 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -13,7 +13,11 @@ #include <stdlib.h> // For malloc. -#include "basic_types.h" +#include "libyuv/basic_types.h" + +#if defined(__native_client__) +#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE +#endif #ifdef __cplusplus namespace libyuv { @@ -38,7 +42,8 @@ extern "C" { var = 0 #if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ - defined(TARGET_IPHONE_SIMULATOR) + defined(TARGET_IPHONE_SIMULATOR) || \ + (defined(_MSC_VER) && defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // True if compiling for SSSE3 as a requirement. @@ -47,7 +52,12 @@ extern "C" { #endif // Enable for NaCL pepper 33 for bundle and AVX2 support.
-// #define NEW_BINUTILS +#if defined(__native_client__) && PPAPI_RELEASE >= 33 +#define NEW_BINUTILS +#endif +#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 +#define LIBYUV_DISABLE_NEON +#endif // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ @@ -152,6 +162,11 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #endif +// The following are available on x64 Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) +#define HAS_I422TOARGBROW_SSSE3 +#endif + // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -235,6 +250,10 @@ extern "C" { #define HAS_MIRRORROW_SSE2 #endif +// The following are available on arm64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -330,7 +349,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_I422TOABGRROW_MIPS_DSPR2 @@ -426,7 +446,7 @@ typedef uint8 uvec8[16]; "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ #opcode " (%%r15,%%r14),%" #arg "\n" \ BUNDLEUNLOCK -#else +#else // defined(__native_client__) && defined(__x86_64__) #define BUNDLEALIGN "\n" #define MEMACCESS(base) "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")" @@ -443,6 +463,15 @@ typedef uint8 uvec8[16]; #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" #define MEMOPARG(opcode, offset, base, index, scale, arg) \ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) "\n" +#endif #endif void I444ToARGBRow_NEON(const uint8* src_y, diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h index 973d46457..a3bc07e0f 100644 --- a/third_party/libyuv/include/libyuv/scale.h +++ b/third_party/libyuv/include/libyuv/scale.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/libyuv/include/libyuv/scale_argb.h new file mode 100644 index 000000000..0c9b36257 --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale_argb.h @@ -0,0 +1,57 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 5d91f8f76..8dc0762f2 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT #define INCLUDE_LIBYUV_SCALE_ROW_H_ -#include "basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h new file mode 100644 index 000000000..912c4c9e0 --- /dev/null +++ b/third_party/libyuv/include/libyuv/version.h @@ -0,0 +1,16 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#define INCLUDE_LIBYUV_VERSION_H_ + +#define LIBYUV_VERSION 1041 + +#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h new file mode 100644 index 000000000..91acc2ffc --- /dev/null +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -0,0 +1,182 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Common definitions for video, including fourcc and VideoFormat. 
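// A usage sketch for the clipped scaler declared in scale_argb.h above
// (sizes and buffer names are illustrative). Both calls map the same
// 1280x720 ARGB source onto a 640x360 destination, but the second writes
// only the 64x64 destination tile at (0, 0), which allows a frame to be
// scaled in independent pieces:
//
//   ARGBScale(src, 1280 * 4, 1280, 720,
//             dst, 640 * 4, 640, 360, kFilterBilinear);
//   ARGBScaleClip(src, 1280 * 4, 1280, 720,
//                 dst, 640 * 4, 640, 360,
//                 0, 0, 64, 64, kFilterBilinear);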
+ +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#define INCLUDE_LIBYUV_VIDEO_COMMON_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +////////////////////////////////////////////////////////////////////////////// +// Definition of FourCC codes +////////////////////////////////////////////////////////////////////////////// + +// Convert four characters to a FourCC code. +// Needs to be a macro otherwise the OS X compiler complains when the kFormat* +// constants are used in a switch. +#ifdef __cplusplus +#define FOURCC(a, b, c, d) ( \ + (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \ + (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24)) +#else +#define FOURCC(a, b, c, d) ( \ + ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ + ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#endif + +// Some pages discussing FourCC codes: +// http://www.fourcc.org/yuv.php +// http://v4l2spec.bytesex.org/spec/book1.htm +// http://developer.apple.com/quicktime/icefloe/dispatch020.html +// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 +// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt + +// FourCC codes grouped according to implementation efficiency. +// Primary formats should convert in 1 efficient step. +// Secondary formats are converted in 2 steps. +// Auxiliary formats call primary converters. +enum FourCC { + // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. + FOURCC_I420 = FOURCC('I', '4', '2', '0'), + FOURCC_I422 = FOURCC('I', '4', '2', '2'), + FOURCC_I444 = FOURCC('I', '4', '4', '4'), + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_I400 = FOURCC('I', '4', '0', '0'), + FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), + FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), + FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), + FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + + // 2 Secondary YUV formats: row biplanar. + FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + + // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. + FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), + FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), + FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), + FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. + FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. + FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. + + // 4 Secondary RGB formats: 4 Bayer Patterns. + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), + + // 1 Primary Compressed YUV format. + FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), + + // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), + FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), + FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), + FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. + FOURCC_J420 = FOURCC('J', '4', '2', '0'), + FOURCC_J400 = FOURCC('J', '4', '0', '0'), + + // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. + FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. + FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. + FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. + FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
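// Worked example of the FOURCC() packing defined above: 'I' is 0x49,
// '4' is 0x34, '2' is 0x32 and '0' is 0x30, so FOURCC('I', '4', '2', '0')
// evaluates to 0x49 | 0x34 << 8 | 0x32 << 16 | 0x30 << 24 = 0x30323449,
// i.e. the four characters stored in little-endian byte order.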
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. + FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. + FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. + FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. + FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. + FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. + FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. + FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. + FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB + FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB + FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. + FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. + FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. + + // 1 Auxiliary compressed YUV format set aside for capturer. + FOURCC_H264 = FOURCC('H', '2', '6', '4'), + + // Match any fourcc. + FOURCC_ANY = -1, +}; + +enum FourCCBpp { + // Canonical fourcc codes used in our code. + FOURCC_BPP_I420 = 12, + FOURCC_BPP_I422 = 16, + FOURCC_BPP_I444 = 24, + FOURCC_BPP_I411 = 12, + FOURCC_BPP_I400 = 8, + FOURCC_BPP_NV21 = 12, + FOURCC_BPP_NV12 = 12, + FOURCC_BPP_YUY2 = 16, + FOURCC_BPP_UYVY = 16, + FOURCC_BPP_M420 = 12, + FOURCC_BPP_Q420 = 12, + FOURCC_BPP_ARGB = 32, + FOURCC_BPP_BGRA = 32, + FOURCC_BPP_ABGR = 32, + FOURCC_BPP_RGBA = 32, + FOURCC_BPP_24BG = 24, + FOURCC_BPP_RAW = 24, + FOURCC_BPP_RGBP = 16, + FOURCC_BPP_RGBO = 16, + FOURCC_BPP_R444 = 16, + FOURCC_BPP_RGGB = 8, + FOURCC_BPP_BGGR = 8, + FOURCC_BPP_GRBG = 8, + FOURCC_BPP_GBRG = 8, + FOURCC_BPP_YV12 = 12, + FOURCC_BPP_YV16 = 16, + FOURCC_BPP_YV24 = 24, + FOURCC_BPP_YU12 = 12, + FOURCC_BPP_J420 = 12, + FOURCC_BPP_J400 = 8, + FOURCC_BPP_MJPG = 0, // 0 means unknown. + FOURCC_BPP_H264 = 0, + FOURCC_BPP_IYUV = 12, + FOURCC_BPP_YU16 = 16, + FOURCC_BPP_YU24 = 24, + FOURCC_BPP_YUYV = 16, + FOURCC_BPP_YUVS = 16, + FOURCC_BPP_HDYC = 16, + FOURCC_BPP_2VUY = 16, + FOURCC_BPP_JPEG = 1, + FOURCC_BPP_DMB1 = 1, + FOURCC_BPP_BA81 = 8, + FOURCC_BPP_RGB3 = 24, + FOURCC_BPP_BGR3 = 24, + FOURCC_BPP_CM32 = 32, + FOURCC_BPP_CM24 = 24, + + // Match any fourcc. + FOURCC_BPP_ANY = 0, // 0 means unknown. +}; + +// Converts fourcc aliases into canonical ones. +LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc new file mode 100644 index 000000000..9ea81b4e2 --- /dev/null +++ b/third_party/libyuv/source/compare.cc @@ -0,0 +1,325 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/compare.h" + +#include <float.h> +#include <math.h> +#ifdef _OPENMP +#include <omp.h> +#endif + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency.
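// The hash implemented below is djb2: hash = hash * 33 + byte, which the
// C version expresses as hash += (hash << 5) + src[i]. A scalar sketch:
//
//   uint32 hash = seed;  // 5381 gives the classic djb2 hash.
//   for (int i = 0; i < count; ++i)
//     hash = hash * 33 + src[i];
//
// The SIMD versions further down consume 16 bytes per step, using the fact
// that appending 16 bytes multiplies the old hash by 33^16 and adds the new
// bytes weighted by 33^15 .. 33^0; those powers are exactly the kHash16x33
// and kHashMul0..kHashMul3 constants in the SSE4.1 and AVX2 code.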
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); + +// This module is for Visual C x86 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) +#define HAS_HASHDJB2_SSE41 +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); + +#if _MSC_VER >= 1700 +#define HAS_HASHDJB2_AVX2 +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); +#endif + +#endif // HAS_HASHDJB2_SSE41 + +// hash seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SUMSQUAREERROR_NEON +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +#endif +// Visual C 2012 required for AVX2. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 +#define HAS_SUMSQUAREERROR_AVX2 +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +#endif + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32. + // After each block of 65536 pixels, accumulate into a uint64. + const int kBlockSize = 65536; + int remainder = count & (kBlockSize - 1) & ~31; + uint64 sse = 0; + int i; + uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = + SumSquareError_C; +#if defined(HAS_SUMSQUAREERROR_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SumSquareError = SumSquareError_NEON; + } +#endif +#if defined(HAS_SUMSQUAREERROR_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. + SumSquareError = SumSquareError_SSE2; + } +#endif +#if defined(HAS_SUMSQUAREERROR_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // Note only used for multiples of 32 so count is not checked. 
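// (This assignment completes the runtime dispatch pattern used throughout
// libyuv: the function pointer starts at the portable C implementation,
// SumSquareError_C, and is promoted to the best variant TestCpuFlag()
// reports, NEON, SSE2 or AVX2 here, so one binary adapts to the CPU it
// runs on.)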
+ SumSquareError = SumSquareError_AVX2; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+: sse) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + sse += SumSquareError(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + sse += SumSquareError(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & 31; + if (remainder) { + sse += SumSquareError_C(src_a, src_b, remainder); + } + return sse; +} + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + uint64 sse = 0; + int h; + // Coalesce rows. + if (stride_a == width && + stride_b == width) { + width *= height; + height = 1; + stride_a = stride_b = 0; + } + for (h = 0; h < height; ++h) { + sse += ComputeSumSquareError(src_a, src_b, width); + src_a += stride_a; + src_b += stride_b; + } + return sse; +} + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count) { + double psnr; + if (sse > 0) { + double mse = (double)(count) / (double)(sse); + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) + psnr = kMaxPsnr; + + return psnr; +} + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + const uint64 samples = width * height; + const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, + src_b, stride_b, + width, height); + return SumSquareErrorToPsnr(sse, samples); +} + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, + src_y_b, stride_y_b, + width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + const uint64 samples = width * height + 2 * (width_uv * height_uv); + const uint64 sse = sse_y + sse_u + sse_v; + return SumSquareErrorToPsnr(sse, samples); +} + +static const int64 cc1 = 26634; // (64^2*(.01*255)^2 +static const int64 cc2 = 239708; // (64^2*(.03*255)^2 + +static double Ssim8x8_C(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) { + int64 sum_a = 0; + int64 sum_b = 0; + int64 sum_sq_a = 0; + int64 sum_sq_b = 0; + int64 sum_axb = 0; + + int i; + for (i = 0; i < 8; ++i) { + int j; + for (j = 0; j < 8; ++j) { + sum_a += src_a[j]; + sum_b += src_b[j]; + sum_sq_a += src_a[j] * src_a[j]; + sum_sq_b += src_b[j] * src_b[j]; + sum_axb += src_a[j] * src_b[j]; + } + + src_a += stride_a; + src_b += stride_b; + } + + { + const int64 count = 64; + // scale the constants by number of pixels + const int64 c1 = (cc1 * count * count) >> 12; + const int64 c2 = (cc2 * count * count) >> 12; + + const int64 sum_a_x_sum_b = sum_a * sum_b; + + const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + + const int64 sum_a_sq = sum_a*sum_a; + const int64 sum_b_sq = sum_b*sum_b; + + const 
int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + + count * sum_sq_b - sum_b_sq + c2); + + if (ssim_d == 0.0) { + return DBL_MAX; + } + return ssim_n * 1.0 / ssim_d; + } +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height) { + int samples = 0; + double ssim_total = 0; + double (*Ssim8x8)(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b) = Ssim8x8_C; + + // sample point start with each 4x4 location + int i; + for (i = 0; i < height - 8; i += 4) { + int j; + for (j = 0; j < width - 8; j += 4) { + ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); + samples++; + } + + src_a += stride_a * 4; + src_b += stride_b * 4; + } + + ssim_total /= samples; + return ssim_total; +} + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height) { + const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, + src_y_b, stride_y_b, width, height); + const int width_uv = (width + 1) >> 1; + const int height_uv = (height + 1) >> 1; + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, + src_u_b, stride_u_b, + width_uv, height_uv); + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, + src_v_b, stride_v_b, + width_uv, height_uv); + return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc new file mode 100644 index 000000000..c546b5182 --- /dev/null +++ b/third_party/libyuv/source/compare_common.cc @@ -0,0 +1,42 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse = 0u; + int i; + for (i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; + sse += (uint32)(diff * diff); + } + return sse; +} + +// hash seed of 5381 recommended. +// Internal C version of HashDjb2 with int sized count for efficiency. +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc new file mode 100644 index 000000000..5e7b8e443 --- /dev/null +++ b/third_party/libyuv/source/compare_neon.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_posix.cc b/third_party/libyuv/source/compare_posix.cc new file mode 100644 index 000000000..ac361190e --- /dev/null +++ b/third_party/libyuv/source/compare_posix.cc @@ -0,0 +1,158 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && 
!defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc new file mode 100644 index 000000000..99831651f --- /dev/null +++ b/third_party/libyuv/source/compare_win.cc @@ -0,0 +1,232 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + align 4 + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + movdqa xmm2, [edx] + lea edx, [edx + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable: 4752) +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + align 4 + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + sub ecx, 32 + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 4 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
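// The pmulld() macro defined above works around older MSVC inline
// assemblers that lack SSE4.1 mnemonics: it emits the instruction's raw
// encoding, 66 0F 38 40 /r, byte by byte, and takes the ModRM byte as its
// argument. For example pmulld(0xc6) encodes ModRM 0xC6 (mod 11,
// reg 000 = xmm0, rm 110 = xmm6) and therefore assembles to
// pmulld xmm0, xmm6, matching the opcode listing above.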
+#if _MSC_VER >= 1700 +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + movdqa xmm6, kHash16x33 + + align 4 + wloop: + vpmovzxbd xmm3, dword ptr [eax] // src[0-3] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] + pmulld xmm3, kHashMul0 + vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] + pmulld xmm4, kHashMul1 + vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] + pmulld xmm2, kHashMul2 + lea eax, [eax + 16] + pmulld xmm1, kHashMul3 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc new file mode 100644 index 000000000..874a6cb7c --- /dev/null +++ b/third_party/libyuv/source/convert.cc @@ -0,0 +1,1513 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int src_uv_width, int src_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// Copy I420 with optional flipping +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. 
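// The SUBSAMPLE() macro defined above divides a dimension by 2^s with
// rounding away from zero, so chroma planes for odd sizes come out large
// enough: SUBSAMPLE(639, 1, 1) is (639 + 1) >> 1 = 320, and
// SUBSAMPLE(-639, 1, 1) is -((639 + 1) >> 1) = -320, where a plain
// arithmetic shift of -638 would give -319.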
+LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); +} + +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
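// Inversion is done without copying extra rows: point the source at its
// last row and negate the stride so each step walks upward. The pattern,
// as used in the block below:
//
//   if (height < 0) {
//     height = -height;
//     src = src + (height - 1) * stride;  // Start at the last row.
//     stride = -stride;                   // Step toward the first row.
//   }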
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && + src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. 
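// Coalescing works because a stride equal to the row width means the rows
// are packed end to end in memory, so the plane can be processed as one
// row of width * height (or halfwidth * halfheight for UV) pixels with a
// single row-function call instead of a per-row loop.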
+ if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) { + SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2; + if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_MIPS_DSPR2; + } + } + } +#endif + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + for (y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert Q420 to I420. 
+// Format is rows of YY/YUYV +LIBYUV_API +int Q420ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfheight = (height + 1) >> 1; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + if (!src_y || !src_yuy2 || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // CopyRow for rows of just Y in Q420 copied to Y plane of I420. +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + CopyRow(src_y, dst_y, width); + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + } + return 0; +} + +// Convert YUY2 to I420. 
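// YUY2 is packed 4:2:2: each 4-byte group holds Y0 U Y1 V, two luma
// samples sharing one chroma pair (16 bits per pixel). To reach 4:2:0,
// the converter below hands YUY2ToUVRow the source stride so it can
// average the U and V samples of two adjacent rows into one chroma row,
// while YUY2ToYRow extracts a full-width Y row from every input row; the
// odd final row passes a stride of 0 so chroma is taken from that row
// alone.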
+LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } + } + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUVRow = UYVYToUVRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + BGRAToYRow_C; + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; + BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + BGRAToYRow = BGRAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + if (width >= 16) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + ABGRToYRow_C; + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3; + ABGRToYRow = ABGRToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ABGRToYRow = ABGRToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + if (width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. 
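+// A minimal usage sketch for this converter (buffer names are hypothetical):
+//   uint8 y[64 * 4], u[32 * 2], v[32 * 2];
+//   RGBAToI420(rgba, 64 * 4, y, 64, u, 32, v, 32, 64, 4);
+// Each chroma plane is ((width + 1) / 2) x ((height + 1) / 2) bytes, since
+// I420 subsamples U and V by two in both axes.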
+LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + RGBAToYRow_C; + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3; + RGBAToYRow = RGBAToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + RGBAToYRow = RGBAToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + if (width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB24TOYROW_NEON) + void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + } + if (width >= 16) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +#else // HAS_RGB24TOYROW_NEON + +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RGB24TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RAWTOYROW_NEON) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + } + if (width >= 16) { + RAWToUVRow = RAWToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +#else // HAS_RAWTOYROW_NEON + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RAWTOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB565TOYROW_NEON) + void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + } + if (width >= 16) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +#else // HAS_RGB565TOYROW_NEON + +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_RGB565TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB565TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + } + if (width >= 16) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#else // HAS_ARGB1555TOYROW_NEON + +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB1555TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB1555TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +// Convert ARGB4444 to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Allocate 2 rows of ARGB. 
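+  // The row size is rounded up to a multiple of 16 bytes so the SSE row
+  // functions can issue full 16-byte stores; e.g. width 101 gives
+  // 101 * 4 = 404, rounded up to 416.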
+ const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + } + if (width >= 16) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +#else // HAS_ARGB4444TOYROW_NEON + +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif // HAS_ARGBTOUVROW_SSSE3 +#endif // HAS_ARGB4444TOYROW_NEON + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); +#endif + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc new file mode 100644 index 000000000..ac0bc3d15 --- /dev/null +++ b/third_party/libyuv/source/convert_argb.cc @@ -0,0 +1,938 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
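+  // When every plane is stored contiguously (each stride equals its row
+  // width), the whole image can be treated as a single row of
+  // width * height pixels, e.g. a 64x4 frame becomes one 256-pixel row,
+  // which cuts per-row call overhead.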
+ if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB. 
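+// I400 is a Y-only (greyscale) format. This reference path converts through
+// YToARGBRow, i.e. the full YUV matrix with neutral chroma; the plain
+// I400ToARGB below uses I400ToARGBRow, which replicates Y into R, G and B.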
+LIBYUV_API +int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = YToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_YTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YToARGBRow = YToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_SSE2; + } + } +#elif defined(HAS_YTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YToARGBRow = YToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YToARGBRow = YToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } + } +#elif defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle table for converting BGRA to ARGB. +static uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting ABGR to ARGB. +static uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting RGBA to ARGB. +static uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + +// Convert BGRA to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + (const uint8*)(&kShuffleMaskBGRAToARGB), + width, height); +} + +// Convert ARGB to BGRA (same as BGRAToARGB). 
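+// These byte swizzles are involutions: applying the same shuffle mask twice
+// restores the original order, so each direction pair shares one mask and
+// one ARGBShuffle call.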
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+    src_stride_rgb24 = -src_stride_rgb24;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb24 == width * 3 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_rgb24 = dst_stride_argb = 0;
+  }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+    }
+  }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    RGB24ToARGBRow(src_rgb24, dst_argb, width);
+    src_rgb24 += src_stride_rgb24;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int y;
+  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RAWToARGBRow_C;
+  if (!src_raw || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_raw = src_raw + (height - 1) * src_stride_raw;
+    src_stride_raw = -src_stride_raw;
+  }
+  // Coalesce rows.
+ if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#elif defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. + if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#elif defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. 
+ if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. + if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
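+  // For biplanar input the destination is inverted rather than the source,
+  // so the Y/UV row pairing below can still walk the source top-down.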
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
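+  // As with NV12, the destination rows are flipped. M420 groups two Y rows
+  // with one interleaved UV row, so the loop below advances the source by
+  // src_stride_m420 * 3 for every two output rows.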
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + // Posix is 16, Windows is 8. + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + // Posix is 16, Windows is 8. 
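+  // Dispatch is three-tiered: the Any_ variant handles any width above the
+  // minimum, the Unaligned_ variant requires a multiple-of-16 width, and the
+  // fastest variant additionally requires 16-byte aligned pointers and
+  // strides.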
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc new file mode 100644 index 000000000..c1a2f62f0 --- /dev/null +++ b/third_party/libyuv/source/convert_from.cc @@ -0,0 +1,1210 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
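+// The I4xx family differs only in chroma subsampling: 422 halves the chroma
+// width, 444 keeps full resolution and 411 quarters the width. The Y plane
+// passes through at full size while the chroma planes are rescaled
+// bilinearly to the requested dst_uv_width x dst_uv_height.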
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
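+  // The packed YUY2 output is inverted rather than the planar source; each
+  // source chroma row feeds two packed output rows in the loop below.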
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+          IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+          IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
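+    // NV12 interleaves the two chroma sources byte-wise as U0 V0 U1 V1 ...,
+    // so each merged row holds halfwidth UV pairs (halfwidth * 2 bytes).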
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I422ToARGBRow = I422ToARGBRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGBRow = I422ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+    dst_argb += dst_stride_argb;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v || !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } + } +#elif defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } + } +#elif defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } + } +#elif defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#elif defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; + if (!src_y || !src_u || !src_v || !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } +#if defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } +#elif defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*I422ToARGB1555Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#elif defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*I422ToARGB4444Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#elif defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+                    const uint8* u, int u_stride,
+                    const uint8* v, int v_stride,
+                    uint8* dst_sample, int dst_sample_stride,
+                    int width, int height,
+                    uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int r = 0;
+  if (!y || !u || !v || !dst_sample ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      r = I420ToYUY2(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_UYVY:
+      r = I420ToUYVY(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2,
+                     width, height);
+      break;
+    case FOURCC_RGBP:
+      r = I420ToRGB565(y, y_stride,
+                       u, u_stride,
+                       v, v_stride,
+                       dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2,
+                       width, height);
+      break;
+    case FOURCC_RGBO:
+      r = I420ToARGB1555(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ? dst_sample_stride : width * 2,
+                         width, height);
+      break;
+    case FOURCC_R444:
+      r = I420ToARGB4444(y, y_stride,
+                         u, u_stride,
+                         v, v_stride,
+                         dst_sample,
+                         dst_sample_stride ?
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGGR: + r = I420ToBayerBGGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GBRG: + r = I420ToBayerGBRG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_GRBG: + r = I420ToBayerGRBG(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_RGGB: + r = I420ToBayerRGGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_NV12: { + uint8* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_uv, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + case FOURCC_NV21: { + uint8* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_vu, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + // TODO(fbarchard): Add M420 and Q420. 
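+    // Note: the NV12/NV21 cases above assume a packed destination buffer in
+    // which the interleaved chroma plane starts at dst_sample +
+    // width * height, immediately after the luma plane.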
+ // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc new file mode 100644 index 000000000..121a41611 --- /dev/null +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1113 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
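+  // When each plane's stride equals its row width (half width for the
+  // horizontally subsampled U and V planes here), rows are contiguous in
+  // memory and the image can be processed as one long row: width becomes
+  // width * height and height becomes 1, avoiding per-row loop overhead.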
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } + } +#endif + +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I411 +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+  if (src_stride_argb == width * 4 &&
+      dst_stride_y == width &&
+      dst_stride_u * 4 == width &&
+      dst_stride_v * 4 == width) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToYRow = ARGBToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 32) {
+      ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUV411Row = ARGBToUV411Row_NEON;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToYRow(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Allocate temporary rows of U and V, after the parameter check so that
+  // an early return cannot leak the buffer.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUVRow = ARGBToUVRow_SSSE3;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          ARGBToYRow = ARGBToYRow_SSSE3;
+        }
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUVRow = ARGBToUVRow_NEON;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+        MergeUVRow_ = MergeUVRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+    ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+    src_argb += src_stride_argb * 2;
+    dst_y += dst_stride_y * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+    ARGBToYRow(src_argb, dst_y, width);
+  }
+  free_aligned_buffer_64(row_u);
+  return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  if (!src_argb ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Allocate temporary rows of U and V, after the parameter check so that
+  // an early return cannot leak the buffer.
+  align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+  uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+  if (src_stride_argb == width * 4 &&
+      dst_stride_yuy2 == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_yuy2 = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToYUY2Row = I422ToYUY2Row_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate temporary rows of Y, U and V.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+      src_argb += src_stride_argb;
+      dst_yuy2 += dst_stride_yuy2;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_uyvy, int dst_stride_uyvy,
+               int width, int height) {
+  int y;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) = ARGBToUV422Row_C;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+                        const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+  if (!src_argb || !dst_uyvy ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_uyvy == width * 2) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_uyvy = 0;
+  }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToYRow = ARGBToYRow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToYRow = ARGBToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
+      }
+    }
+  }
+#endif
+
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  {
+    // Allocate temporary rows of Y, U and V.
+    align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+    uint8* row_u = row_y + ((width + 63) & ~63);
+    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+    for (y = 0; y < height; ++y) {
+      ARGBToUV422Row(src_argb, row_u, row_v, width);
+      ARGBToYRow(src_argb, row_y, width);
+      I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+      src_argb += src_stride_argb;
+      dst_uyvy += dst_stride_uyvy;
+    }
+
+    free_aligned_buffer_64(row_y);
+  }
+  return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+      ARGBToYRow_C;
+  if (!src_argb || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+ if (src_stride_argb == width * 4 && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Convert ARGB to RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), + width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#elif defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#elif defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#elif defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. 
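+// ARGB4444 keeps only the high 4 bits of each 8-bit channel, packing a
+// pixel into 16 bits: 4 bits each of A, R, G and B.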
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#elif defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3; + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + if (width >= 16) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_jpeg.cc b/third_party/libyuv/source/convert_jpeg.cc new file mode 100644 index 000000000..bcb980f7f --- /dev/null +++ b/third_party/libyuv/source/convert_jpeg.cc @@ -0,0 +1,392 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
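+  // The decoder's per-component sampling factors (horizontal x vertical)
+  // select the converter below: luma sampled at 2x2 relative to chroma is
+  // already I420 and is copied; 2x1 takes the I422 path, 1x1 the I444 path,
+  // 4x1 the I411 path, and a single grayscale component the I400 path.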
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/convert_to_argb.cc b/third_party/libyuv/source/convert_to_argb.cc new file mode 100644 index 000000000..1b228a7b4 --- /dev/null +++ b/third_party/libyuv/source/convert_to_argb.cc @@ -0,0 +1,327 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+                  uint8* crop_argb, int argb_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if the destination crop_argb is the same as the
+  // source sample, also enable the temporary buffer.
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+      crop_argb == sample;
+  uint8* tmp_argb = crop_argb;
+  int tmp_argb_stride = argb_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (crop_argb == NULL || sample == NULL ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  if (need_buf) {
+    int argb_size = crop_width * abs_crop_height * 4;
+    rotate_buffer = (uint8*)malloc(argb_size);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + crop_argb = rotate_buffer; + argb_stride = crop_width; + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToARGB(src, aligned_src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToARGB(src, src_width * 3, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToARGB(src, src_width * 4, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToARGB(src, src_width * 2, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. + case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. 
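+      // (A dedicated NV21ToARGB entry point exists for ARGB output, so the
+      // VU ordering is handled inside the call below rather than by swapping
+      // the chroma arguments at this call site.)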
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; +// case FOURCC_Q420: +// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; +// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + +// src_width + crop_x * 2; +// r = Q420ToARGB(src, src_width * 3, +// src_uv, src_width * 3, +// crop_argb, argb_stride, +// crop_width, inv_crop_height); +// break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToARGB(sample, sample_size, + crop_argb, argb_stride, + src_width, abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc 
- return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_to_i420.cc b/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..7b194fff7
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,383 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/format_conversion.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One-pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if the destination y is the same as the source
+  // sample, also enable the temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + y = rotate_buffer; + u = y + y_size; + v = u + uv_size; + y_stride = crop_width; + u_stride = v_stride = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // TODO(fbarchard): Support cropping Bayer by odd numbers + // by adjusting fourcc. 
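+    // (An odd crop_x or crop_y shifts the 2x2 mosaic phase, so the fourcc
+    // would have to be remapped to the neighbouring pattern; e.g. BGGR
+    // cropped one pixel to the right reads back as GBRG.)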
+ case FOURCC_BGGR: + src = sample + (src_width * crop_y + crop_x); + r = BayerBGGRToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GBRG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGBRGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_GRBG: + src = sample + (src_width * crop_y + crop_x); + r = BayerGRBGToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGGB: + src = sample + (src_width * crop_y + crop_x); + r = BayerRGGBToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + // Call NV12 but with u and v parameters swapped. + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_Q420: + src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; + src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + + src_width + crop_x * 2; + r = Q420ToI420(src, src_width * 3, + src_uv, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = 
sample + src_width * abs_src_height +
+            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+      }
+      r = I422ToI420(src_y, src_width,
+                     src_u, halfwidth,
+                     src_v, halfwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I444:
+    case FOURCC_YV24: {
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u;
+      const uint8* src_v;
+      if (format == FOURCC_YV24) {
+        src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      } else {
+        src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+        src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+      }
+      r = I444ToI420(src_y, src_width,
+                     src_u, src_width,
+                     src_v, src_width,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+    case FOURCC_I411: {
+      int quarterwidth = (src_width + 3) / 4;
+      const uint8* src_y = sample + src_width * crop_y + crop_x;
+      const uint8* src_u = sample + src_width * abs_src_height +
+          quarterwidth * crop_y + crop_x / 4;
+      const uint8* src_v = sample + src_width * abs_src_height +
+          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+      r = I411ToI420(src_y, src_width,
+                     src_u, quarterwidth,
+                     src_v, quarterwidth,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     crop_width, inv_crop_height);
+      break;
+    }
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToI420(sample, sample_size,
+                     y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 520cfe510..2e0d61d20 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -8,9 +8,9 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -27,7 +27,7 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"  // For CPU_X86
+#include "libyuv/basic_types.h"  // For CPU_X86
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -48,7 +48,7 @@ extern "C" {
     defined(__i386__) || defined(__x86_64__))
 LIBYUV_API
 void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
 #if (_MSC_FULL_VER >= 160040219)
   __cpuidex((int*)(cpu_info), info_eax, info_ecx);
 #elif defined(_M_IX86)
@@ -188,10 +188,14 @@ LIBYUV_API SAFEBUFFERS
 int InitCpuFlags(void) {
 #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
   uint32 cpu_info1[4] = { 0, 0, 0, 0 };
   uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
-  CpuId(7, 0, cpu_info7);
+  if (cpu_info0[0] >= 7) {
+    CpuId(7, 0, cpu_info7);
+  }
   cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
               ((cpu_info1[2] & 0x00000200) ?
kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | @@ -199,6 +203,7 @@ int InitCpuFlags(void) { ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | kCpuHasX86; + #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave TestOsSaveYmm()) { // Saves YMM. diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc new file mode 100644 index 000000000..a3daf96a9 --- /dev/null +++ b/third_party/libyuv/source/format_conversion.cc @@ -0,0 +1,552 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/format_conversion.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// generate a selector mask useful for pshufb +static uint32 GenerateSelector(int select0, int select1) { + return (uint32)(select0) | + (uint32)((select1 + 4) << 8) | + (uint32)((select0 + 8) << 16) | + (uint32)((select1 + 12) << 24); +} + +static int MakeSelectors(const int blue_index, + const int green_index, + const int red_index, + uint32 dst_fourcc_bayer, + uint32* index_map) { + // Now build a lookup table containing the indices for the four pixels in each + // 2x2 Bayer grid. + switch (dst_fourcc_bayer) { + case FOURCC_BGGR: + index_map[0] = GenerateSelector(blue_index, green_index); + index_map[1] = GenerateSelector(green_index, red_index); + break; + case FOURCC_GBRG: + index_map[0] = GenerateSelector(green_index, blue_index); + index_map[1] = GenerateSelector(red_index, green_index); + break; + case FOURCC_RGGB: + index_map[0] = GenerateSelector(red_index, green_index); + index_map[1] = GenerateSelector(green_index, blue_index); + break; + case FOURCC_GRBG: + index_map[0] = GenerateSelector(green_index, red_index); + index_map[1] = GenerateSelector(blue_index, green_index); + break; + default: + return -1; // Bad FourCC + } + return 0; +} + +// Converts 32 bit ARGB to Bayer RGB formats. 
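+// For example (values follow from GenerateSelector above): with the ARGB
+// byte offsets B=0, G=1, R=2, a BGGR target uses
+//   index_map[0] = GenerateSelector(0, 1);  // 0x0d080500: B,G,B,G row
+//   index_map[1] = GenerateSelector(1, 2);  // 0x0e090601: G,R,G,R row
+// where each selector byte picks one channel from each of four ARGB pixels.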
+LIBYUV_API +int ARGBToBayer(const uint8* src_argb, int src_stride_argb, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + int y; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + + for (y = 0; y < height; ++y) { + ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width); + src_argb += src_stride_argb; + dst_bayer += dst_stride_bayer; + } + return 0; +} + +#define AVG(a, b) (((a) + (b)) >> 1) + +static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 r = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + g = src_bayer0[1]; + r = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer0[0]; + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = AVG(r, src_bayer1[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[0]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer1[1]; + dst_argb[7] = 255U; + } +} + +static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 g = src_bayer0[1]; + uint8 b = src_bayer1[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[7] = 255U; + g = src_bayer0[1]; + b = src_bayer1[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer1[1]); + dst_argb[1] = AVG(g, src_bayer0[1]); + dst_argb[2] = src_bayer0[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[1]; + dst_argb[5] = src_bayer0[1]; + dst_argb[6] = src_bayer0[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 b = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + 
dst_argb[3] = 255U; + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[7] = 255U; + b = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = AVG(b, src_bayer0[1]); + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = src_bayer1[0]; + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer0[1]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer1[0]; + dst_argb[7] = 255U; + } +} + +static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, + uint8* dst_argb, int pix) { + const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; + uint8 r = src_bayer0[1]; + int x; + for (x = 0; x < pix - 2; x += 2) { + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]); + dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + r = src_bayer0[1]; + src_bayer0 += 2; + src_bayer1 += 2; + dst_argb += 8; + } + dst_argb[0] = src_bayer1[0]; + dst_argb[1] = src_bayer0[0]; + dst_argb[2] = AVG(r, src_bayer0[1]); + dst_argb[3] = 255U; + if (!(pix & 1)) { + dst_argb[4] = src_bayer1[0]; + dst_argb[5] = src_bayer0[0]; + dst_argb[6] = src_bayer0[1]; + dst_argb[7] = 255U; + } +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + uint32 src_fourcc_bayer) { + int y; + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + dst_argb + dst_stride_argb, width); + src_bayer += src_stride_bayer * 2; + dst_argb += dst_stride_argb * 2; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); + } + return 0; +} + +// Converts any Bayer RGB format to ARGB. +LIBYUV_API +int BayerToI420(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + uint32 src_fourcc_bayer) { + void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, + uint8* dst_argb, int pix); + + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + // Negative height means invert the image. 
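+  // (Negative height is the libyuv convention for bottom-up buffers: the
+  // destination pointers are moved to the last row of each plane and the
+  // strides are negated, so rows are written in reverse order.)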
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; + ARGBToUVRow = ARGBToUVRow_SSSE3; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } + } +#elif defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } + } +#endif + + switch (src_fourcc_bayer) { + case FOURCC_BGGR: + BayerRow0 = BayerRowBG; + BayerRow1 = BayerRowGR; + break; + case FOURCC_GBRG: + BayerRow0 = BayerRowGB; + BayerRow1 = BayerRowRG; + break; + case FOURCC_GRBG: + BayerRow0 = BayerRowGR; + BayerRow1 = BayerRowBG; + break; + case FOURCC_RGGB: + BayerRow0 = BayerRowRG; + BayerRow1 = BayerRowGB; + break; + default: + return -1; // Bad FourCC + } + + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int y; + for (y = 0; y < height - 1; y += 2) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, + row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); + src_bayer += src_stride_bayer * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BayerRow0(src_bayer, src_stride_bayer, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + } + free_aligned_buffer_64(row); + } + return 0; +} + +// Convert I420 to Bayer. +LIBYUV_API +int I420ToBayer(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bayer, int dst_stride_bayer, + int width, int height, + uint32 dst_fourcc_bayer) { + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; + const int blue_index = 0; // Offsets for ARGB format + const int green_index = 1; + const int red_index = 2; + uint32 index_map[2]; + // Negative height means invert the image. 
+ if (height < 0) { + int halfheight; + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } + } +#endif + + if (MakeSelectors(blue_index, green_index, red_index, + dst_fourcc_bayer, index_map)) { + return -1; // Bad FourCC + } + { + // Allocate a row of ARGB. 
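+    // (A single scratch row suffices: each row is expanded to ARGB and
+    // immediately reduced to Bayer, with index_map[y & 1] alternating the
+    // selector to track the 2x2 mosaic phase.)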
+    align_buffer_64(row, width * 4);
+    int y;
+    for (y = 0; y < height; ++y) {
+      I422ToARGBRow(src_y, src_u, src_v, row, width);
+      ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+      dst_bayer += dst_stride_bayer;
+      src_y += src_stride_y;
+      if (y & 1) {
+        src_u += src_stride_u;
+        src_v += src_stride_v;
+      }
+    }
+    free_aligned_buffer_64(row);
+  }
+  return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_y, int dst_stride_y, \
+                         uint8* dst_u, int dst_stride_u, \
+                         uint8* dst_v, int dst_stride_v, \
+                         int width, int height) { \
+  return BayerToI420(src_bayer, src_stride_bayer, \
+                     dst_y, dst_stride_y, \
+                     dst_u, dst_stride_u, \
+                     dst_v, dst_stride_v, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+                       const uint8* src_u, int src_stride_u, \
+                       const uint8* src_v, int src_stride_v, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return I420ToBayer(src_y, src_stride_y, \
+                     src_u, src_stride_u, \
+                     src_v, src_stride_v, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+                       uint8* dst_bayer, int dst_stride_bayer, \
+                       int width, int height) { \
+  return ARGBToBayer(src_argb, src_stride_argb, \
+                     dst_bayer, dst_stride_bayer, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+                         uint8* dst_argb, int dst_stride_argb, \
+                         int width, int height) { \
+  return BayerToARGB(src_bayer, src_stride_bayer, \
+                     dst_argb, dst_stride_argb, \
+                     width, height, \
+                     FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 000000000..15b0ed88a
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,566 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
+    !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
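+
+// Typical decode sequence for this class (illustrative sketch only; plane
+// allocation and error handling elided, names are placeholders):
+//   MJpegDecoder decoder;
+//   if (decoder.LoadFrame(jpg_data, jpg_size)) {
+//     int w = decoder.GetWidth();
+//     int h = decoder.GetHeight();
+//     decoder.DecodeToBuffers(planes, w, h);  // or DecodeToCallback().
+//   }
+//   decoder.UnloadFrame();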
+ +namespace libyuv { + +#ifdef HAVE_SETJMP +struct SetJmpErrorMgr { + jpeg_error_mgr base; // Must be at the top + jmp_buf setjmp_buffer; +}; +#endif + +const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; +const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; +const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; +const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; +const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; +const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; + +// Methods that are passed to jpeglib. +boolean fill_input_buffer(jpeg_decompress_struct* cinfo); +void init_source(jpeg_decompress_struct* cinfo); +void skip_input_data(jpeg_decompress_struct* cinfo, + long num_bytes); // NOLINT +void term_source(jpeg_decompress_struct* cinfo); +void ErrorHandler(jpeg_common_struct* cinfo); + +MJpegDecoder::MJpegDecoder() + : has_scanline_padding_(LIBYUV_FALSE), + num_outbufs_(0), + scanlines_(NULL), + scanlines_sizes_(NULL), + databuf_(NULL), + databuf_strides_(NULL) { + decompress_struct_ = new jpeg_decompress_struct; + source_mgr_ = new jpeg_source_mgr; +#ifdef HAVE_SETJMP + error_mgr_ = new SetJmpErrorMgr; + decompress_struct_->err = jpeg_std_error(&error_mgr_->base); + // Override standard exit()-based error handler. + error_mgr_->base.error_exit = &ErrorHandler; +#endif + decompress_struct_->client_data = NULL; + source_mgr_->init_source = &init_source; + source_mgr_->fill_input_buffer = &fill_input_buffer; + source_mgr_->skip_input_data = &skip_input_data; + source_mgr_->resync_to_restart = &jpeg_resync_to_restart; + source_mgr_->term_source = &term_source; + jpeg_create_decompress(decompress_struct_); + decompress_struct_->src = source_mgr_; + buf_vec_.buffers = &buf_; + buf_vec_.len = 1; +} + +MJpegDecoder::~MJpegDecoder() { + jpeg_destroy_decompress(decompress_struct_); + delete decompress_struct_; + delete source_mgr_; +#ifdef HAVE_SETJMP + delete error_mgr_; +#endif + DestroyOutputBuffers(); +} + +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { + if (!ValidateJpeg(src, src_len)) { + return LIBYUV_FALSE; + } + + buf_.data = src; + buf_.len = (int)(src_len); + buf_vec_.pos = 0; + decompress_struct_->client_data = &buf_vec_; +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_read_header, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { + // ERROR: Bad MJPEG header + return LIBYUV_FALSE; + } + AllocOutputBuffers(GetNumComponents()); + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_size = GetComponentScanlinesPerImcuRow(i); + if (scanlines_sizes_[i] != scanlines_size) { + if (scanlines_[i]) { + delete scanlines_[i]; + } + scanlines_[i] = new uint8* [scanlines_size]; + scanlines_sizes_[i] = scanlines_size; + } + + // We allocate padding for the final scanline to pad it up to DCTSIZE bytes + // to avoid memory errors, since jpeglib only reads full MCUs blocks. For + // the preceding scanlines, the padding is not needed/wanted because the + // following addresses will already be valid (they are the initial bytes of + // the next scanline) and will be overwritten when jpeglib writes out that + // next scanline. 
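+    // For example, with DCTSIZE 8 a 100-pixel-wide component is given a
+    // 104-byte stride by GetComponentStride() below.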
+ int databuf_stride = GetComponentStride(i); + int databuf_size = scanlines_size * databuf_stride; + if (databuf_strides_[i] != databuf_stride) { + if (databuf_[i]) { + delete databuf_[i]; + } + databuf_[i] = new uint8[databuf_size]; + databuf_strides_[i] = databuf_stride; + } + + if (GetComponentStride(i) != GetComponentWidth(i)) { + has_scanline_padding_ = LIBYUV_TRUE; + } + } + return LIBYUV_TRUE; +} + +static int DivideAndRoundUp(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static int DivideAndRoundDown(int numerator, int denominator) { + return numerator / denominator; +} + +// Returns width of the last loaded frame. +int MJpegDecoder::GetWidth() { + return decompress_struct_->image_width; +} + +// Returns height of the last loaded frame. +int MJpegDecoder::GetHeight() { + return decompress_struct_->image_height; +} + +// Returns format of the last loaded frame. The return value is one of the +// kColorSpace* constants. +int MJpegDecoder::GetColorSpace() { + return decompress_struct_->jpeg_color_space; +} + +// Number of color components in the color space. +int MJpegDecoder::GetNumComponents() { + return decompress_struct_->num_components; +} + +// Sample factors of the n-th component. +int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. 
+ // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. 
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. + for (int i = 0; i < num_outbufs_; ++i) { + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + databuf_[i] -= data_to_skip; + } + lines_left -= scanlines_to_copy; + } + } + // Read full MCUs until we get to the crop point. + for (; lines_left >= GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); + } + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + (*fn)(opaque, databuf_, databuf_strides_, lines_left); + } + return FinishDecode(); +} + +void init_source(j_decompress_ptr cinfo) { + fill_input_buffer(cinfo); +} + +boolean fill_input_buffer(j_decompress_ptr cinfo) { + BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); + if (buf_vec->pos >= buf_vec->len) { + assert(0 && "No more data"); + // ERROR: No more data + return FALSE; + } + cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; + cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; + ++buf_vec->pos; + return TRUE; +} + +void skip_input_data(j_decompress_ptr cinfo, + long num_bytes) { // NOLINT + cinfo->src->next_input_byte += num_bytes; +} + +void term_source(j_decompress_ptr cinfo) { + // Nothing to do. +} + +#ifdef HAVE_SETJMP +void ErrorHandler(j_common_ptr cinfo) { + // This is called when a jpeglib command experiences an error. Unfortunately + // jpeglib's error handling model is not very flexible, because it expects the + // error handler to not return--i.e., it wants the program to terminate. To + // recover from errors we use setjmp() as shown in their example. setjmp() is + // C's implementation for the "call with current continuation" functionality + // seen in some functional programming languages. + // A formatted message can be output, but is unsafe for release. +#ifdef DEBUG + char buf[JMSG_LENGTH_MAX]; + (*cinfo->err->format_message)(cinfo, buf); + // ERROR: Error in jpeglib: buf +#endif + + SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); + // This rewinds the call stack to the point of the corresponding setjmp() + // and causes it to return (for a second time) with value 1. + longjmp(mgr->setjmp_buffer, 1); +} +#endif + +void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { + if (num_outbufs != num_outbufs_) { + // We could perhaps optimize this case to resize the output buffers without + // necessarily having to delete and recreate each one, but it's not worth + // it. 
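+    // Rough shape of what gets (re)allocated here, assuming the usual raw
+    // decode setup (see SetScanlinePointers() below): for each component i,
+    // databuf_[i] holds one iMCU row, i.e. GetComponentScanlinesPerImcuRow(i)
+    // scanlines of GetComponentStride(i) bytes, and scanlines_[i] is the
+    // array of row pointers into it that jpeg_read_raw_data() consumes. For
+    // 4:2:0 that is 16 Y rows plus 8 U and 8 V rows per pass.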
+    DestroyOutputBuffers();
+
+    scanlines_ = new uint8** [num_outbufs];
+    scanlines_sizes_ = new int[num_outbufs];
+    databuf_ = new uint8* [num_outbufs];
+    databuf_strides_ = new int[num_outbufs];
+
+    for (int i = 0; i < num_outbufs; ++i) {
+      scanlines_[i] = NULL;
+      scanlines_sizes_[i] = 0;
+      databuf_[i] = NULL;
+      databuf_strides_[i] = 0;
+    }
+
+    num_outbufs_ = num_outbufs;
+  }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    delete [] scanlines_[i];
+    delete [] databuf_[i];
+  }
+  delete [] scanlines_;
+  delete [] databuf_;
+  delete [] scanlines_sizes_;
+  delete [] databuf_strides_;
+  scanlines_ = NULL;
+  databuf_ = NULL;
+  scanlines_sizes_ = NULL;
+  databuf_strides_ = NULL;
+  num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling block smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    uint8* data_i = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+      scanlines_[i][j] = data_i;
+      data_i += GetComponentStride(i);
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+      jpeg_read_raw_data(decompress_struct_,
+                         scanlines_,
+                         GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 3) {  // Color images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 2 &&
+        subsample_x[2] == 2 && subsample_y[2] == 2) {
+      return kJpegYuv420;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+               subsample_x[1] == 2 && subsample_y[1] == 1 &&
+               subsample_x[2] == 2 && subsample_y[2] == 1) {
+      return kJpegYuv422;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+               subsample_x[1] == 1 && subsample_y[1] == 1 &&
+               subsample_x[2] == 1 && subsample_y[2] == 1) {
+      return kJpegYuv444;
+    }
+  } else if (number_of_components == 1) {  // Grey-scale images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..23d22d099
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to validate the jpeg appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  size_t i;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  for (i = sample_size - 2; i > 1;) {
+    if (sample[i] != 0xd9) {
+      if (sample[i] == 0xff && sample[i + 1] == 0xd9) {  // End Of Image
+        return LIBYUV_TRUE;  // Success: Valid jpeg.
+      }
+      --i;
+    }
+    --i;
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc
index 68b8f46e4..3857008ca 100644
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -8,15 +8,15 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "third_party/libyuv/include/libyuv/planar_functions.h"
+#include "libyuv/planar_functions.h"
 
 #include <string.h>  // for memset()
 
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
 #ifdef HAVE_JPEG
-#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#include "libyuv/mjpeg_decoder.h"
 #endif
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -37,6 +37,10 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
 #if defined(HAS_COPYROW_X86)
   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
     CopyRow = CopyRow_X86;
diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..2ef3228cb
--- /dev/null
+++ b/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,1301 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +#define HAS_MIRRORROW_UV_NEON +void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); +#define HAS_TRANSPOSE_WX8_NEON +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWX8_NEON +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif // defined(__ARM_NEON__) + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif // defined(__mips__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + defined(_M_IX86) && defined(_MSC_VER) +#define HAS_TRANSPOSE_WX8_SSSE3 +__declspec(naked) __declspec(align(16)) +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. 
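+    // The scalar equivalent (TransposeWx8_C later in this file) is:
+    //   for (c = 0; c < width; ++c)
+    //     for (r = 0; r < 8; ++r)
+    //       dst[c * dst_stride + r] = src[r * src_stride + c];
+    // The SIMD version reaches the same result with three interleave passes:
+    // punpcklbw above paired bytes of adjacent rows, this pass pairs 16-bit
+    // words, and punpckldq below pairs 32-bit dwords, at which point each
+    // qword holds one transposed output row.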
+ punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. + punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +__declspec(naked) __declspec(align(16)) +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqa xmm0, [eax] + movdqa xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqa xmm2, [eax] + movdqa xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqa xmm4, [eax] + movdqa xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqa xmm6, [eax] + movdqa xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqa xmm5, [esp] // restore xmm5 + movdqa [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqa xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. 
+ punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. + punpckldq xmm1, xmm5 + movlpd qword ptr [edx], xmm1 + movhpd qword ptr [ebx], xmm1 + punpckhdq xmm0, xmm5 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm3 // use xmm0 as the temp register. + punpckldq xmm3, xmm7 + movlpd qword ptr [edx], xmm3 + movhpd qword ptr [ebx], xmm3 + punpckhdq xmm0, xmm7 + sub ecx, 8 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + jg convertloop + + mov esp, [esp + 16] + pop ebp + pop edi + pop esi + pop ebx + ret + } +} +#elif !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSE_WX8_SSSE3 +static void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc" + #if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + #endif + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) +#define HAS_TRANSPOSE_UVWX8_SSE2 +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqa (%eax),%xmm0 \n" + "movdqa (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqa (%eax),%xmm2 \n" + "movdqa (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqa (%eax),%xmm4 \n" + "movdqa (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqa (%eax),%xmm6 \n" + "movdqa (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqa (%esp),%xmm5 \n" + "movdqa %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqa (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" 
+ "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqa (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqa (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqa (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqa (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqa (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +#define HAS_TRANSPOSE_UVWX8_SSE2 +static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqa (%0),%%xmm4 \n" + "movdqa (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(w) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +static void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +static void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSE_WX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; + } +#endif +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. 
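+    // A transpose maps src(r, c) to dst(c, r), so the 8 source rows consumed
+    // by each pass land as 8 new destination columns; any remainder of fewer
+    // than 8 rows is finished by the TransposeWxH_C call below.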
+ i -= 8; + } + + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); +} + +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. + align_buffer_64(row, width); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSE2; + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + MirrorRow = MirrorRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } +#endif +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. 
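+  // For example, height = 5 gives half_height = (5 + 1) >> 1 = 3: iterations
+  // y = 0 and y = 1 exchange rows 0<->4 and 1<->3 through 'row', and y = 2
+  // writes the mirrored middle row onto itself.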
+ for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +static void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +static void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#if defined(HAS_TRANSPOSE_UVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 8) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } +#endif + + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. 
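+    // Same tiling as TransposePlane, but each source row carries interleaved
+    // U/V byte pairs and width counts pairs: every pass splits 8 such rows
+    // into 8 new columns of the separate U and V planes, one pair at a time
+    // in the TransposeUVWx8_C reference above.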
+ i -= 8; + } + + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); +} + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorRowUV = MirrorUVRow_NEON; + } +#elif defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + MirrorRowUV = MirrorUVRow_SSSE3; + } +#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, + dst, dst_stride, + width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
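+  // For example, height = -480 flips the source: height becomes 480,
+  // halfheight (480 + 1) >> 1 = 240, the Y plane is repointed at its last
+  // row (479), the half-height U and V planes at row 239, and all three
+  // source strides are negated so reads walk bottom-up.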
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV90(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV270(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV180(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_argb.cc b/third_party/libyuv/source/rotate_argb.cc new file mode 100644 index 000000000..ab0f9ce07 --- /dev/null +++ b/third_party/libyuv/source/rotate_argb.cc @@ -0,0 +1,209 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx,
+                               uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx,
+                            uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride,
+                          int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) &&  // Width of dest.
+      IS_ALIGNED(src, 4)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    dst += dst_stride;
+    src += 4;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride,
+                  int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride,
+                   int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
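+  // An ARGB row is width * 4 bytes, hence the buffer size and the width * 4
+  // passed to CopyRow below; ARGBMirrorRow still takes width in pixels since
+  // it must reverse whole 4-byte pixels rather than bytes.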
+ align_buffer_64(row, width * 4); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + ARGBMirrorRow = ARGBMirrorRow_SSSE3; + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } +#endif +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) { + CopyRow = CopyRow_NEON; + } +#endif +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && + IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_mips.cc b/third_party/libyuv/source/rotate_mips.cc new file mode 100644 index 000000000..70770fd06 --- /dev/null +++ b/third_party/libyuv/source/rotate_mips.cc @@ -0,0 +1,485 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, 
$t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, 
%[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" + "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + "swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a 
or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc new file mode 100644 index 000000000..d354e11fa --- /dev/null +++ b/third_party/libyuv/source/rotate_neon.cc @@ -0,0 +1,533 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+ MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} + +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allows for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
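+  // A scalar sketch of this counter idiom (names hypothetical): the kernels
+  // iterate "for (w = width - 8; w >= 0; w -= 8)" over full 8-wide blocks,
+  // then "w += 8" leaves 0..7 residual columns for the 4/2/1-wide paths.
+  // MEMACCESS(n), as far as I can tell, is a libyuv macro that expands to a
+  // Native Client address-masking sequence for operand n and to nothing in
+  // ordinary builds.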
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index 27a0de119..97ef84417 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" -#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { @@ -35,10 +35,12 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, - 0, 4, 7) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1, 4, 7) +#endif // HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_I444TOARGBROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4, 7) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2, 4, 7) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, @@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOARGBROW_SSSE3 +#endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_AVX2 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) #endif // HAS_I422TOARGBROW_AVX2 diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index ceb3836cd..fa2b752a2 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #include // For memcpy and memset. -#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc index a804670d3..ae9370c1b 100644 --- a/third_party/libyuv/source/row_mips.cc +++ b/third_party/libyuv/source/row_mips.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -16,7 +16,8 @@ extern "C" { #endif // The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { @@ -376,7 +377,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { // MIPS DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) + (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { __asm__ __volatile__ ( diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index c5ae2c583..1392cf5fc 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -20,34 +20,46 @@ extern "C" { // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 2 U and 2 V from 422 #define READYUV411 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.16 {d2[1]}, [%2]! \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d2, d3 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.8 {d3}, [%2]! \n" \ "vpaddl.u8 q1, q1 \n" \ "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ @@ -55,7 +67,9 @@ extern "C" { // Read 8 Y and 4 VU from NV21 #define READNV21 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ @@ -63,6 +77,7 @@ extern "C" { // Read 8 YUY2 #define READYUY2 \ + MEMACCESS(0) \ "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -70,6 +85,7 @@ extern "C" { // Read 8 UYVY #define READUYVY \ + MEMACCESS(0) \ "vld2.8 {d2, d3}, [%0]! \n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ @@ -113,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -124,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -144,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -155,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -175,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -186,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -206,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -218,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -238,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -250,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -270,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -281,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -301,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -311,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUV422TORGB "subs %4, %4, #8 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -331,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -342,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -374,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -385,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" ARGBTORGB565 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -420,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -432,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_y), // %0 @@ -461,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -474,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_y), // %0 @@ -492,7 +541,9 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -503,6 +554,7 @@ void YToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -522,10 +574,12 @@ void I400ToARGBRow_NEON(const uint8* src_y, ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -541,7 +595,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -552,6 +608,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -570,7 +627,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -581,6 +640,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -599,7 +659,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -610,6 +672,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -628,7 +691,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -639,6 +704,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : "+r"(src_y), // %0 @@ -656,7 +722,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -667,6 +735,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -683,7 +752,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -694,6 +765,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -712,9 +784,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) "vst1.8 {q1}, [%2]! \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 @@ -732,9 +807,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" : @@ -752,8 +830,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 @@ -770,6 +850,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" : "+r"(dst), // %0 @@ -798,10 +879,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -822,10 +906,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -846,10 +933,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -865,8 +955,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. 
"subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -882,9 +974,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -912,9 +1006,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -958,9 +1054,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -987,9 +1085,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -1004,8 +1104,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1020,9 +1122,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1037,8 +1141,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1053,8 +1159,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1070,9 +1178,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 U. 
+ MEMACCESS(2) "vst1.8 {d3}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1089,9 +1200,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) "vst1.8 {d2}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1109,12 +1223,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "add %1, %0, %1 \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d3}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1133,12 +1251,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "add %1, %0, %1 \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d2}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1157,10 +1279,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -1178,11 +1303,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "vmov.u32 d6[0], %3 \n" // selector "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1198,8 +1325,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { asm volatile ( "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1214,12 +1343,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 4. 
"bgt 1b \n" : "+r"(src_argb), // %0 @@ -1237,10 +1369,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1260,10 +1396,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1280,9 +1420,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1298,9 +1440,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1317,9 +1461,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1338,6 +1484,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1345,6 +1492,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1362,12 +1510,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1390,6 +1540,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. 
"vmull.u8 q2, d0, d24 \n" // B @@ -1405,7 +1556,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1429,7 +1582,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1450,7 +1605,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1475,12 +1632,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. @@ -1508,7 +1669,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1547,12 +1710,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1564,7 +1731,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1591,12 +1760,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1608,7 +1781,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1634,12 +1809,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1651,7 +1830,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1677,12 +1858,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1694,7 +1879,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1720,12 +1907,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. 
@@ -1737,7 +1928,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1763,12 +1956,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1780,7 +1977,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1806,12 +2005,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1823,7 +2026,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1850,22 +2055,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1887,7 +2096,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! 
\n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -1914,22 +2125,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1951,7 +2166,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -1978,22 +2195,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -2015,7 +2236,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2037,6 +2260,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -2045,6 +2269,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -2063,6 +2288,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -2071,6 +2297,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -2089,6 +2316,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -2097,6 +2325,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2115,6 +2344,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // R @@ -2122,6 +2352,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -2140,6 +2371,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // R @@ -2147,6 +2379,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -2165,6 +2398,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B @@ -2172,6 +2406,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -2190,6 +2425,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2197,6 +2433,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -2215,6 +2452,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2222,6 +2460,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -2252,7 +2491,9 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -2261,46 +2502,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" @@ -2323,7 +2576,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "blt 89f \n" // Blend 8 pixels. "8: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2337,6 +2592,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" @@ -2346,7 +2602,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2360,6 +2618,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "bge 1b \n" @@ -2379,6 +2638,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q10, d0, d3 \n" // b * a @@ -2387,6 +2647,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2410,6 +2671,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -2427,6 +2689,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d2, q1 \n" "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2451,6 +2714,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 255) @@ -2465,6 +2729,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vqmovn.u16 d22, q11 \n" "vqmovn.u16 d24, q12 \n" "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2485,6 +2750,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -2493,6 +2759,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2520,6 +2787,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d30, #50 \n" // BR coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -2534,6 +2802,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2550,12 +2819,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit @@ -2594,6 +2865,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2614,7 +2886,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B @@ -2625,6 +2899,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2645,11 +2920,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2669,11 +2947,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2698,12 +2979,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d1 \n" // add "vmov.u8 d1, d0 \n" "vmov.u8 d2, d0 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2722,10 +3006,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 16 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2749,10 +3036,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
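+ // Output mapping of the vst4.8 above: B = sobely (d0), G = sobelx +
+ // sobely (d1), R = sobelx (d2); d3 presumably holds the alpha constant
+ // set before this loop.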
"bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2773,21 +3063,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) "vst1.8 {d0}, [%3]! \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 @@ -2810,21 +3107,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc new file mode 100644 index 000000000..46e9ceb33 --- /dev/null +++ b/third_party/libyuv/source/row_neon64.cc @@ -0,0 +1,3323 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! 
\n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB \ + "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ + "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ + "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ + "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ + "vtrn.u8 d0, d1 \n" \ + "vsub.s16 q0, q0, q15 \n"/* offset y */\ + "vmul.s16 q0, q0, q14 \n" \ + "vadd.s16 d18, d19 \n" \ + "vqadd.s16 d20, d0, d16 \n" /* B */ \ + "vqadd.s16 d21, d1, d16 \n" \ + "vqadd.s16 d22, d0, d17 \n" /* R */ \ + "vqadd.s16 d23, d1, d17 \n" \ + "vqadd.s16 d16, d0, d18 \n" /* G */ \ + "vqadd.s16 d17, d1, d18 \n" \ + "vqshrun.s16 d0, q10, #6 \n" /* B */ \ + "vqshrun.s16 d1, q11, #6 \n" /* G */ \ + "vqshrun.s16 d2, q8, #6 \n" /* R */ \ + "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ + "vmovl.u8 q11, d1 \n" \ + "vmovl.u8 q8, d2 \n" \ + "vtrn.u8 d20, d21 \n" \ + "vtrn.u8 d22, d23 \n" \ + "vtrn.u8 d16, d17 \n" \ + "vmov.u8 d21, d16 \n" + +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +#ifdef HAS_I422TORGB565ROW_NEON +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TORGB565ROW_NEON + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB1555ROW_NEON +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB1555ROW_NEON + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +#ifdef HAS_I422TOARGB4444ROW_NEON +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + MEMACCESS(5) + "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) + "vld1.8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_YTOARGBROW_NEON +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YTOARGBROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + MEMACCESS(4) + "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) + "vld1.8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) + "vld1.8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. +#ifdef HAS_MERGEUVROW_NEON +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_MERGEUVROW_NEON + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +#ifdef HAS_COPYROW_NEON +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "vst1.8 {d0, d1, d2, d3}, [%1]! 
\n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +// SetRow8 writes 'count' bytes using a 32 bit value repeated. +#ifdef HAS_SETROW_NEON +void SetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} +#endif // HAS_SETROW_NEON + +// TODO(fbarchard): Make fully assembler +// SetRow32 writes 'count' words using a 32 bit value repeated. +#ifdef HAS_ARGBSETROWS_NEON +void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, + int dst_stride, int height) { + for (int y = 0; y < height; ++y) { + SetRow_NEON(dst, v32, width << 2); + dst += dst_stride; + } +} +#endif // HAS_ARGBSETROWS_NEON + +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} +#endif // HAS_MIRRORROW_NEON + +#ifdef HAS_MIRRORUVROW_NEON +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} +#endif // HAS_MIRRORUVROW_NEON + +#ifdef HAS_ARGBMIRRORROW_NEON +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +#ifdef HAS_HALFROW_NEON +void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. + "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(src_uv_stride), // %1 + "+r"(dst_uv), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_HALFROW_NEON + +// Select 2 channels from ARGB on alternating pixels. e.g. 
BGBGBGBG +#ifdef HAS_ARGBTOBAYERROW_NEON +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "vmov.u32 d6[0], %3 \n" // selector + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels + "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels + "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" // store 8. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "r"(selector) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERROW_NEON + +// Select G channels from ARGB. e.g. GGGGGGGG +#ifdef HAS_ARGBTOBAYERGGROW_NEON +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_ARGBTOBAYERGGROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. 
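+ // With U in d0, Y0 in d1, V in d2 and Y1 in d3, vst4.8 interleaves the
+ // bytes as U Y V Y, exactly the UYVY macropixel layout.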
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 
+#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 
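For orientation, the RGBTOUV macro above is the NEON form of the fixed-point math libyuv uses in its scalar fallbacks: the coefficients are 8-bit fixed point, and the 0x8080 bias in q15 shifts the signed result into the unsigned 0..255 range before the narrowing shift. The /2 in the vmov constants (e.g. #112 / 2) compensates for inputs that are sums of two pixels rather than averages. A minimal scalar sketch on averaged B/G/R values (helper names illustrative; the NEON vqshrn also saturates):

static int RGBToU(int r, int g, int b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // q10/q11/q12 plus q15 bias
}
static int RGBToV(int r, int g, int b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;  // q10/q14/q13 plus q15 bias
}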
+#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. 
+ "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. 
+ "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 
255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // RB coefficient
+ "vmov.u8 d29, #98 \n" // RG coefficient
+ "vmov.u8 d30, #50 \n" // RR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
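Before the NEON implementation below, the intended transform is clearer in scalar form: each output channel is the dot product of the source (B, G, R, A) with one 4-byte row of matrix_argb, scaled by 1/64 and saturated, as vqshrun.s16 #6 does. A sketch (helper names illustrative, not libyuv API); note that the code below appears to overwrite q15 (the widened source alpha) with the A accumulator before the final set of multiplies, so its A-column terms use the accumulated value rather than the source alpha:

static unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void ColorMatrixPixel(const unsigned char bgra[4],
                             const signed char matrix_argb[16],
                             unsigned char out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {  // i selects the output B, G, R, A channel.
    int sum = bgra[0] * matrix_argb[i * 4 + 0] +  // B term
              bgra[1] * matrix_argb[i * 4 + 1] +  // G term
              bgra[2] * matrix_argb[i * 4 + 2] +  // R term
              bgra[3] * matrix_argb[i * 4 + 3];   // A term
    out[i] = Clamp0To255(sum >> 6);  // 6-bit fixed point, saturated.
  }
}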
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. + "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q15, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) + "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc index e47708802..106fda568 100644 --- a/third_party/libyuv/source/row_posix.cc +++ b/third_party/libyuv/source/row_posix.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
 */
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
 #ifdef __cplusplus
 namespace libyuv {
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 2cfacad1b..8eb888926 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -8,15 +8,179 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
+
+#if defined (_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
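Both the aligned routine below and the unaligned variant after it evaluate, per pixel, the 6-bit fixed-point conversion encoded by the YG/UB..VR constants and kUVBias* tables above. A scalar sketch of that math (Clamp0To255 is an illustrative helper; the intrinsics get the clamp for free from _mm_packus_epi16, and alpha is forced to 255 via _mm_set1_epi8(-1)):

static unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvToArgbPixel(int y, int u, int v,
                           unsigned char* b, unsigned char* g,
                           unsigned char* r) {
  const int y1 = (y - 16) * YG;  // kYSub16 then kYToRgb.
  *b = Clamp0To255((UB * u + VB * v - BB + y1) >> 6);  // kUVToB, kUVBiasB
  *g = Clamp0To255((UG * u + VG * v - BG + y1) >> 6);  // kUVToG, kUVBiasG
  *r = Clamp0To255((UR * u + VR * v - BR + y1) >> 6);  // kUVToR, kUVBiasR
}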
+__declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_store_si128((__m128i *)dst_argb, xmm0); + _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} + +// Unaligned destination version. 
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const __m128i xmm4 = _mm_setzero_si128(); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_load_si128(&xmm0); + xmm2 = _mm_load_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); + xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); + xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); + xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); + xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); + xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_load_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +// 32 bit +#else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ - -#define UB 127 /* min(63,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 - -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ - -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 - #ifdef HAS_I422TOARGBROW_AVX2 static const lvec8 kUVToB_AVX = { @@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = { // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
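// Editor's note: the AVX2 row below therefore produces 16 ARGB pixels per
// iteration, twice the 8 per iteration of the SSSE3 rows above.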
__declspec(naked) __declspec(align(16)) void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { __asm { push esi push edi @@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422TOARGBROW_SSSE3 -static const vec8 kUVToB = { - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB -}; - -static const vec8 kUVToR = { - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR -}; - -static const vec8 kUVToG = { - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG -}; - -static const vec8 kVUToB = { - VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, -}; - -static const vec8 kVUToR = { - VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, -}; - -static const vec8 kVUToG = { - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, -}; - -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; - // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 444. @@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/row_x86.asm b/third_party/libyuv/source/row_x86.asm new file mode 100644 index 000000000..0cb326f8e --- /dev/null +++ b/third_party/libyuv/source/row_x86.asm @@ -0,0 +1,146 @@ +; +; Copyright 2012 The LibYuv Project Authors. All rights reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01020000h +%error AVX2 is supported only by yasm 1.2.0 or later. +%endif +%endif +%include "x86inc.asm" + +SECTION .text + +; cglobal numeric constants are parameters, gpr regs, mm regs + +; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) + +%macro YUY2TOYROW 2-3 +cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix +%ifidn %1,YUY2 + pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff + psrlw m2, m2, 8 +%endif + + ALIGN 4 +.convertloop: + mov%2 m0, [src_yuy2q] + mov%2 m1, [src_yuy2q + mmsize] + lea src_yuy2q, [src_yuy2q + mmsize * 2] +%ifidn %1,YUY2 + pand m0, m0, m2 ; YUY2 even bytes are Y + pand m1, m1, m2 +%else + psrlw m0, m0, 8 ; UYVY odd bytes are Y + psrlw m1, m1, 8 +%endif + packuswb m0, m0, m1 +%if cpuflag(AVX2) + vpermq m0, m0, 0xd8 +%endif + sub pixd, mmsize + mov%2 [dst_yq], m0 + lea dst_yq, [dst_yq + mmsize] + jg .convertloop + REP_RET +%endmacro + +; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. 
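+; Editor's note: per pixel, the YUY2TOYROW macro above is equivalent to this
+; scalar C sketch (a hypothetical reference helper, not part of the patch);
+; YUY2 stores Y in the even bytes, UYVY in the odd bytes:
+;   void YUY2ToYRow(const uint8* src_yuy2, uint8* dst_y, int pix) {
+;     int x;
+;     for (x = 0; x < pix; ++x) {
+;       dst_y[x] = src_yuy2[x * 2];  /* for UYVY: src_uyvy[x * 2 + 1] */
+;     }
+;   }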
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro

+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 // first 8 UV pairs
+ punpckhbw m0, m0, m1 // next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc
index 31cedf11d..5b33b5f04 100644
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyPlane
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 000000000..e339cd7c7
--- /dev/null
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,809 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
+ ScaleARGBRowDown2_NEON;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
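+ // Editor's note: the loop below realizes the 1/4 box filter as two 1/2 box
+ // passes: two pairs of source rows are first averaged into the 2x-wide
+ // 'row' buffer, which is then averaged once more down to dst_width.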
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } +#endif + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, + row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } + free_aligned_buffer_64(row); +} + +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. +static void ScaleARGBDownEven(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, + int src_step, uint8* dst_argb, int dst_width) = + filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; +#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : + ScaleARGBRowDownEven_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : + ScaleARGBRowDownEven_NEON; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + +// Scale ARGB down with bilinear interpolation. +static void ScaleARGBBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; + int64 xlast = x + (int64)(dst_width - 1) * dx; + int64 xl = (dx >= 0) ? x : xlast; + int64 xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. 
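+ // Editor's note: x and dx are 16.16 fixed point, so, for example, mapping
+ // 1920 source columns onto 640 destination columns gives
+ // dx = (1920 << 16) / 640 = 0x30000, an advance of exactly 3 source pixels
+ // per output pixel.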
+ src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? + ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
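+// Editor's note: this variant, compiled only when YUVSCALEUP is defined,
+// converts each newly needed source row to ARGB via I422ToARGBRow and then
+// feeds it through the same column-scale and row-interpolate machinery as
+// ScaleARGBBilinearUp above.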
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8* src_row_y = src_y + yi * src_stride_y; + const uint8* src_row_u = src_u + uv_yi * src_stride_u; + const uint8* src_row_v = src_v + uv_yi * src_stride_v; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + // Allocate 1 row of ARGB for source conversion. + align_buffer_64(argb_row, src_width * 4); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; + } + free_aligned_buffer_64(row); + free_aligned_buffer_64(row_argb); +} +#endif + +// Scale ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleARGBSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + int j; + void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBCols = ScaleARGBCols_SSE2; + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, + dst_width, x, dx); + dst_argb += dst_stride; + y += dy; + } +} + +// ScaleARGB a ARGB. +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. +static void ScaleARGB(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64 clipf = (int64)(clip_x) * dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 4; + dst += clip_x * 4; + } + if (clip_y) { + int64 clipf = (int64)(clip_y) * dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleARGBDown4Box(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, + dst, dst_stride, clip_width, clip_height); + return; + } + } + } + } + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. 
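+ // Editor's note: dx == 0x10000 is a horizontal step of exactly 1.0 in
+ // 16.16 fixed point, i.e. the width is already unscaled here and only the
+ // vertical direction needs work.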
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc
index 595ad66ba..e4b2acc41 100644
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyARGB
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc
index 5722dea80..3eb4f27c4 100644
--- a/third_party/libyuv/source/scale_mips.cc
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/ -#include "third_party/libyuv/include/libyuv/basic_types.h" -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/basic_types.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -18,7 +18,8 @@ extern "C" { // This module is for GCC MIPS DSPR2 #if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 704cfd251..1b8a5ba58 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -28,8 +28,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -48,7 +50,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent @@ -57,6 +61,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpadal.u8 q1, q3 \n" "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -73,8 +78,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -87,16 +94,20 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile ( - "add r4, %0, %3 \n" - "add r5, r4, %3 \n" - "add %3, r5, %3 \n" + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 - "vld1.8 {q1}, [r4]! \n" - "vld1.8 {q2}, [r5]! \n" - "vld1.8 {q3}, [%3]! \n" + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" @@ -105,13 +116,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpaddl.u16 q0, q0 \n" "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vmovn.u16 d0, q0 \n" + MEMACCESS(1) "vst1.32 {d0[0]}, [%1]! 
\n" "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(src_stride) // %3 - : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" ); } @@ -124,9 +139,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -145,7 +162,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" @@ -182,6 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" @@ -202,7 +222,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "add %3, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 @@ -222,6 +244,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" + MEMACCESS(1) "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -250,14 +273,18 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q3}, [%3] \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d5[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -272,11 +299,15 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "vld1.8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" ".p2align 2 \n" "1: \n" @@ -285,9 +316,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.8 {d16, d17, d18, d19}, [r4]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -364,18 +398,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2), // %5 - "r"(&kMult38_Div9) // %6 - : "r4", "q0", "q1", "q2", "q3", "q8", "q9", - "q13", "q14", "q15", "memory", "cc" + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" ); } @@ -384,7 +420,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( + MEMACCESS(4) "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" ".p2align 2 \n" @@ -394,7 +432,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "subs %2, %2, #12 \n" @@ -461,7 +501,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + MEMACCESS(1) "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -494,7 +536,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -503,50 +547,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" "99: \n" + MEMACCESS(0) "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 @@ -564,10 +621,14 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 + MEMACCESS(0) "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) "vld2.32 {q2, q3}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) "vst1.8 {q3}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -585,14 +646,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. @@ -602,6 +667,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -621,11 +687,16 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, "mov r12, %3, lsl #2 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) "vld1.32 {d1[1]}, [%0], r12 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 @@ -646,13 +717,21 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) "vld1.8 {d7}, [%1], r12 \n" "vaddl.u8 q0, d0, d1 \n" "vaddl.u8 q1, d2, d3 \n" @@ -665,6 +744,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc index 18b081026..352e66782 100644 --- a/third_party/libyuv/source/scale_posix.cc +++ b/third_party/libyuv/source/scale_posix.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index bd5cca8af..840b9738d 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "third_party/libyuv/include/libyuv/row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { diff --git a/third_party/libyuv/source/video_common.cc b/third_party/libyuv/source/video_common.cc new file mode 100644 index 000000000..efbedf46e --- /dev/null +++ b/third_party/libyuv/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const struct FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, + {FOURCC_RGB3, FOURCC_RAW }, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32 CanonicalFourCC(uint32 fourcc) { + int i; + for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/third_party/libyuv/source/x86inc.asm b/third_party/libyuv/source/x86inc.asm new file mode 100644 index 000000000..cb5c32df3 --- /dev/null +++ b/third_party/libyuv/source/x86inc.asm @@ -0,0 +1,1136 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2012 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Jason Garrett-Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. 
Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +; Local changes for libyuv: +; remove %define program_name and references in labels +; rename cpus to uppercase + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=%1 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; aout does not support align= +%macro SECTION_TEXT 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .text align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. 
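+; Editor's note: as a concrete illustration (reusing SplitUVRow from
+; row_x86.asm earlier in this patch),
+;   cglobal SplitUVRow, 4, 4, 5, src_uv, dst_u, dst_v, pix
+; instantiated under INIT_XMM SSE2 yields a symbol callable from C as
+;   void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u,
+;                        uint8* dst_v, int pix);
+; with all four named arguments loaded into registers by PROLOGUE regardless
+; of the host calling convention.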
+ +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rsp + stack_offset + %3] + %define r%1mp qword r %+ %1m + %else + %define r%1m [esp + stack_offset + %3] + %define r%1mp dword r %+ %1m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + 
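+; Editor's note: the three branches below bind argument registers per ABI:
+; Win64 passes the first four integer args in rcx/rdx/r8/r9, System V x86-64
+; passes the first six in rdi/rsi/rdx/rcx/r8/r9, and x86_32 takes all
+; arguments from the stack.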
+%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + %if mmsize == 8 + %assign xmm_regs_used 0 + %else + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 6 + SUB rsp, (xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i + %endrep + %endif +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +%macro cglobal 1-2+ ; name, [PROLOGUE args] +%if %0 == 1 + cglobal_internal %1 %+ SUFFIX +%else + cglobal_internal %1 %+ SUFFIX, %2 +%endif +%endmacro +%macro cglobal_internal 1-2+ + %ifndef cglobaled_%1 + %xdefine %1 mangle(%1) + %xdefine %1.skip_prologue %1 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %1, 1 + %endif + %xdefine current_function %1 + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 2+ + %xdefine %1 mangle(%1) + global %1 + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX      (1<<0)
+%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop      (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3     (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<22)
+%assign cpuflags_bmi1     (1<<23)
+%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1
+
+%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2 + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(AVX) + %assign AVX_enabled 1 + %endif + %if mmsize == 16 && notcpuflag(SSE2) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, SSE3 + %define movu lddqu + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; merge MMX and SSE* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign AVX_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova vmovaps + %define movu vmovups + %undef movh + %define movnta vmovntps + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. 
+ %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-AVX emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %ifid %6 + %define %%sizeofreg sizeof%6 + %elifid %5 + %define %%sizeofreg sizeof%5 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif + %else + %if %%sizeofreg==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if AVX_enabled && %%sizeofreg==16 + v%1 %5, %6, %7 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %4>=3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif + %endif +%endmacro + +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. 
+%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if AVX_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cvtdq2ps, 1, 0, 0 +AVX_INSTR cvtps2dq, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb, 0, 0, 0 +AVX_INSTR pabsw, 0, 0, 0 +AVX_INSTR pabsd, 0, 0, 0 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR 
pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb, 0, 0, 0 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd, 0, 1, 0 +AVX_INSTR pshufhw, 0, 1, 0 +AVX_INSTR pshuflw, 0, 1, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR ptest, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm index 213467662..99453a998 100644 --- a/third_party/x86inc/x86inc.asm +++ b/third_party/x86inc/x86inc.asm @@ -234,10 +234,10 @@ ALIGNMODE k7 %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rsp + stack_offset + %6] - %define r%1mp qword r %+ %1m + %define r%1mp qword r %+ %1 %+ m %else %define r%1m [esp + stack_offset + %6] - %define r%1mp dword r %+ %1m + %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro diff --git a/tools_common.c b/tools_common.c index 87fa317ce..66dbeee99 100644 --- a/tools_common.c +++ b/tools_common.c @@ -255,7 +255,7 @@ int vpx_img_read(vpx_image_t *img, FILE *file) { #endif for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != w) + if (fread(buf, 1, w, file) != (size_t)w) return 0; buf += stride; } diff --git a/usage.dox b/usage.dox index 92fd6b26e..237b8dc42 100644 --- a/usage.dox +++ b/usage.dox @@ -57,9 +57,6 @@ the vpx_codec_get_caps() method. 
Attempts to invoke features not supported by an algorithm will generally result in #VPX_CODEC_INCAPABLE. - Currently defined features available in both encoders and decoders include: - - \subpage usage_xma - \if decoder Currently defined decoder features include: - \ref usage_cb @@ -70,9 +67,7 @@ To initialize a codec instance, the address of the codec context and interface structures are passed to an initialization function. Depending on the \ref usage_features that the codec supports, the codec could be - initialized in different modes. Most notably, the application may choose to - use \ref usage_xma mode to gain fine grained control over how and where - memory is allocated for the codec. + initialized in different modes. To prevent cases of confusion where the ABI of the library changes, the ABI is versioned. The ABI version number must be passed at @@ -136,73 +131,3 @@ possible." */ - - -/*! \page usage_xma External Memory Allocation - Applications that wish to have fine grained control over how and where - decoders allocate memory \ref MAY make use of the eXternal Memory Allocation - (XMA) interface. Not all codecs support the XMA \ref usage_features. - - To use a decoder in XMA mode, the decoder \ref MUST be initialized with the - vpx_codec_xma_init_ver() function. The amount of memory a decoder needs to - allocate is heavily dependent on the size of the encoded video frames. The - size of the video must be known before requesting the decoder's memory map. - This stream information can be obtained with the vpx_codec_peek_stream_info() - function, which does not require a constructed decoder context. If the exact - stream is not known, a stream info structure can be created that reflects - the maximum size that the decoder instance is required to support. - - Once the decoder instance has been initialized and the stream information - determined, the application calls the vpx_codec_get_mem_map() iterator - repeatedly to get a list of the memory segments requested by the decoder. - The iterator value should be initialized to NULL to request the first - element, and the function will return #VPX_CODEC_LIST_END to signal the end of - the list. - - After each segment is identified, it must be passed to the codec through the - vpx_codec_set_mem_map() function. Segments \ref MUST be passed in the same - order as they are returned from vpx_codec_get_mem_map(), but there is no - requirement that vpx_codec_get_mem_map() must finish iterating before - vpx_codec_set_mem_map() is called. For instance, some applications may choose - to get a list of all requests, construct an optimal heap, and then set all - maps at once with one call. Other applications may set one map at a time, - allocating it immediately after it is returned from vpx_codec_get_mem_map(). - - After all segments have been set using vpx_codec_set_mem_map(), the codec may - be used as it would be in normal internal allocation mode. - - \section usage_xma_seg_id Segment Identifiers - Each requested segment is identified by an identifier unique to - that decoder type. Some of these identifiers are private, while others are - enumerated for application use. Identifiers not enumerated publicly are - subject to change. Identifiers are non-consecutive. - - \section usage_xma_seg_szalign Segment Size and Alignment - The sz (size) and align (alignment) parameters describe the required size - and alignment of the requested segment. Alignment will always be a power of - two. Applications \ref MUST honor the alignment requested. 
Failure to do so - could result in program crashes or may incur a speed penalty. - - \section usage_xma_seg_flags Segment Flags - The flags member of the segment structure indicates any requirements or - desires of the codec for the particular segment. The #VPX_CODEC_MEM_ZERO flag - indicates that the segment \ref MUST be zeroed by the application prior to - passing it to the application. The #VPX_CODEC_MEM_WRONLY flag indicates that - the segment will only be written into by the decoder, not read. If this flag - is not set, the application \ref MUST insure that the memory segment is - readable. On some platforms, framebuffer memory is writable but not - readable, for example. The #VPX_CODEC_MEM_FAST flag indicates that the segment - will be frequently accessed, and that it should be placed into fast memory, - if any is available. The application \ref MAY choose to place other segments - in fast memory as well, but the most critical segments will be identified by - this flag. - - \section usage_xma_seg_basedtor Segment Base Address and Destructor - For each requested memory segment, the application must determine the - address of a memory segment that meets the requirements of the codec. This - address is set in the base member of the #vpx_codec_mmap - structure. If the application requires processing when the segment is no - longer used by the codec (for instance to deallocate it or close an - associated file descriptor) the dtor and priv - members can be set. -*/ diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7d9441d54..a46fbfbbd 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -108,8 +108,8 @@ extern "C" * For temporal denoiser: noise_sensitivity = 0 means off, * noise_sensitivity = 1 means temporal denoiser on for Y channel only, * noise_sensitivity = 2 means temporal denoiser on for all channels. - * noise_sensitivity = 3 will be used for aggressive mode in future. - * Temporal denoiser is enabled via the build option + * noise_sensitivity >= 3 means aggressive denoising mode. + * Temporal denoiser is enabled via the configuration option: * CONFIG_TEMPORAL_DENOISING. * For spatial denoiser: noise_sensitivity controls the amount of * pre-processing blur: noise_sensitivity = 0 means off. 
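
For context, the sketch below shows how an application would drive the setting documented above; it assumes the standard libvpx control API (vpx_codec_control with VP8E_SET_NOISE_SENSITIVITY from vpx/vp8cx.h) and an already-initialized VP8 encoder context, and is an editorial illustration rather than part of this patch:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static vpx_codec_err_t enable_temporal_denoiser(vpx_codec_ctx_t *encoder) {
      /* 0 = off, 1 = denoise Y only, 2 = denoise Y and UV; values >= 3
       * select the aggressive mode described above. Effective only in
       * builds configured with CONFIG_TEMPORAL_DENOISING. */
      return vpx_codec_control(encoder, VP8E_SET_NOISE_SENSITIVITY, 2);
    }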
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index f5870797b..fd9afd2ac 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -220,7 +220,8 @@ $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
 $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2/;
+# Disable neon while investigating https://code.google.com/p/webm/issues/detail?id=817
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2/;
 $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
 $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
 
diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c
index b7bb40c27..08be76e43 100644
--- a/vp8/encoder/arm/neon/denoising_neon.c
+++ b/vp8/encoder/arm/neon/denoising_neon.c
@@ -279,8 +279,8 @@ int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
     {
         const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
         const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
-        const unsigned int sum_block =
-            vget_lane_u32(vreinterpret_u32_u64(_76543210), 0);
+        const int sum_block =
+            vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
         if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
             return COPY_BLOCK;
         }
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 611421575..7c012a829 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -35,10 +35,6 @@ typedef struct
     unsigned char *buffer;
     unsigned char *buffer_end;
     struct vpx_internal_error_info *error;
-
-    /* Variables used to track bit costs without outputing to the bitstream */
-    unsigned int  measure_cost;
-    unsigned long bit_counter;
 } BOOL_CODER;
 
 extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 0f0a36a61..75401fc2b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <limits.h>
+
 #include "denoising.h"
 
 #include "vp8/common/reconinter.h"
@@ -333,8 +335,36 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
     return FILTER_BLOCK;
 }
 
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
+  assert(mode > 0);  // Denoiser is allocated only if mode > 0.
+ if (mode == 1) { + denoiser->denoiser_mode = kDenoiserOnYOnly; + } else if (mode == 2) { + denoiser->denoiser_mode = kDenoiserOnYUV; + } else { + denoiser->denoiser_mode = kDenoiserOnYUVAggressive; + } + if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) { + denoiser->denoise_pars.scale_sse_thresh = 1; + denoiser->denoise_pars.scale_motion_thresh = 8; + denoiser->denoise_pars.scale_increase_filter = 0; + denoiser->denoise_pars.denoise_mv_bias = 95; + denoiser->denoise_pars.pickmode_mv_bias = 100; + denoiser->denoise_pars.qp_thresh = 0; + denoiser->denoise_pars.consec_zerolast = UINT_MAX; + } else { + denoiser->denoise_pars.scale_sse_thresh = 2; + denoiser->denoise_pars.scale_motion_thresh = 16; + denoiser->denoise_pars.scale_increase_filter = 1; + denoiser->denoise_pars.denoise_mv_bias = 60; + denoiser->denoise_pars.pickmode_mv_bias = 60; + denoiser->denoise_pars.qp_thresh = 100; + denoiser->denoise_pars.consec_zerolast = 10; + } +} + int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols) + int num_mb_rows, int num_mb_cols, int mode) { int i; assert(denoiser); @@ -369,10 +399,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); - + vp8_denoiser_set_parameters(denoiser, mode); return 0; } + void vp8_denoiser_free(VP8_DENOISER *denoiser) { int i; @@ -396,11 +427,12 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, loop_filter_info_n *lfi_n, int mb_row, int mb_col, - int block_index, - int uv_denoise) + int block_index) + { int mv_row; int mv_col; + unsigned int motion_threshold; unsigned int motion_magnitude2; unsigned int sse_thresh; int sse_diff_thresh = 0; @@ -411,8 +443,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame; enum vp8_denoiser_decision decision = FILTER_BLOCK; - enum vp8_denoiser_decision decision_u = FILTER_BLOCK; - enum vp8_denoiser_decision decision_v = FILTER_BLOCK; + enum vp8_denoiser_decision decision_u = COPY_BLOCK; + enum vp8_denoiser_decision decision_v = COPY_BLOCK; if (zero_frame) { @@ -424,7 +456,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi; int sse_diff = 0; // Bias on zero motion vector sse. 
- int zero_bias = 95; + const int zero_bias = denoiser->denoise_pars.denoise_mv_bias; zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100); sse_diff = zero_mv_sse - best_sse; @@ -502,14 +534,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, mv_row = x->best_sse_mv.as_mv.row; mv_col = x->best_sse_mv.as_mv.col; motion_magnitude2 = mv_row * mv_row + mv_col * mv_col; - sse_thresh = SSE_THRESHOLD; - if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH; + motion_threshold = denoiser->denoise_pars.scale_motion_thresh * + NOISE_MOTION_THRESHOLD; - if (best_sse > sse_thresh || motion_magnitude2 - > 8 * NOISE_MOTION_THRESHOLD) - { - decision = COPY_BLOCK; - } + if (motion_magnitude2 < + denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD) + x->increase_denoising = 1; + + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD; + if (x->increase_denoising) + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH; + + if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold) + decision = COPY_BLOCK; if (decision == FILTER_BLOCK) { @@ -528,7 +565,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ? kFilterNonZeroMV : kFilterZeroMV; // Only denoise UV for zero motion, and if y channel was denoised. - if (uv_denoise && + if (denoiser->denoiser_mode != kDenoiserOnYOnly && motion_magnitude2 == 0 && decision == FILTER_BLOCK) { unsigned char *mc_running_avg_u = @@ -565,7 +602,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, denoiser->yv12_running_avg[INTRA_FRAME].y_stride); denoiser->denoise_state[block_index] = kNoFilter; } - if (uv_denoise) { + if (denoiser->denoiser_mode != kDenoiserOnYOnly) { if (decision_u == COPY_BLOCK) { vp8_copy_mem8x8( x->block[16].src + *x->block[16].base_src, x->block[16].src_stride, diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h index a1f195b72..89832d3c2 100644 --- a/vp8/encoder/denoising.h +++ b/vp8/encoder/denoising.h @@ -39,16 +39,47 @@ enum vp8_denoiser_filter_state { kFilterNonZeroMV }; +enum vp8_denoiser_mode { + kDenoiserOff, + kDenoiserOnYOnly, + kDenoiserOnYUV, + kDenoiserOnYUVAggressive +}; + +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. 
+  unsigned int consec_zerolast;
+} denoise_params;
+
 typedef struct vp8_denoiser
 {
     YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
     YV12_BUFFER_CONFIG yv12_mc_running_avg;
     unsigned char* denoise_state;
     int num_mb_cols;
+    int denoiser_mode;
+    denoise_params denoise_pars;
 } VP8_DENOISER;
 
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
-                          int num_mb_rows, int num_mb_cols);
+                          int num_mb_rows, int num_mb_cols, int mode);
 
 void vp8_denoiser_free(VP8_DENOISER *denoiser);
 
@@ -61,8 +92,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              loop_filter_info_n *lfi_n,
                              int mb_row,
                              int mb_col,
-                             int block_index,
-                             int uv_denoise);
+                             int block_index);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
         }
 #endif
 
+        // Keep track of how many (consecutive) times a block is coded
+        // as ZEROMV_LASTREF, for base layer frames.
+        // Reset to 0 if it's coded as anything else.
+        if (cpi->current_layer == 0) {
+          if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+              xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+            // Increment, check for wrap-around.
+            if (cpi->consec_zero_last[map_index+mb_col] < 255)
+              cpi->consec_zero_last[map_index+mb_col] += 1;
+          } else {
+            cpi->consec_zero_last[map_index+mb_col] = 0;
+          }
+        }
 
         /* Special case code for cyclic refresh
          * If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                 }
 #endif
 
+                // Keep track of how many (consecutive) times a block
+                // is coded as ZEROMV_LASTREF, for base layer frames.
+                // Reset to 0 if it's coded as anything else.
+                if (cpi->current_layer == 0) {
+                  if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+                      xd->mode_info_context->mbmi.ref_frame ==
+                      LAST_FRAME) {
+                    // Increment, check for wrap-around.
+                    if (cpi->consec_zero_last[map_index+mb_col] < 255)
+                      cpi->consec_zero_last[map_index+mb_col] +=
+                          1;
+                  } else {
+                    cpi->consec_zero_last[map_index+mb_col] = 0;
+                  }
+                }
 
                 /* Special case code for cyclic refresh
                  * If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 373dbebd9..7140f2f1b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
     while(block_count && i != cpi->cyclic_refresh_mode_index);
 
     cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
+        Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+      // Under aggressive denoising mode, use segmentation to turn off loop
+      // filter below some qp thresh. The loop filter is turned off for all
+      // blocks that have been encoded as ZEROMV LAST x frames in a row,
+      // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+      // This is to avoid "dot" artifacts that can occur from repeated
+      // loop filtering on noisy input source.
+      cpi->cyclic_refresh_q = Q;
+      lf_adjustment = -MAX_LOOP_FILTER;
+      for (i = 0; i < mbs_in_frame; ++i) {
+        seg_map[i] = (cpi->consec_zero_last[i] >
+                      cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+      }
+    }
+#endif
 }
 
 /* Activate segmentation.
 */
@@ -1259,6 +1277,15 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
     vpx_free(cpi->tplist);
 
     CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
+
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity > 0) {
+      vp8_denoiser_free(&cpi->denoiser);
+      vp8_denoiser_allocate(&cpi->denoiser, width, height,
+                            cm->mb_rows, cm->mb_cols,
+                            cpi->oxcf.noise_sensitivity);
+    }
+#endif
 }
 
 
@@ -1405,7 +1432,7 @@ static void update_layer_contexts (VP8_COMP *cpi)
         double prev_layer_framerate=0;
 
         assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS);
-        for (i=0; i<oxcf->number_of_layers; i++)
+        for (i = 0; i < oxcf->number_of_layers && i < VPX_TS_MAX_LAYERS; ++i)
         {
             LAYER_CONTEXT *lc = &cpi->layer_context[i];
 
@@ -1752,7 +1779,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
             int width = (cpi->oxcf.Width + 15) & ~15;
             int height = (cpi->oxcf.Height + 15) & ~15;
             vp8_denoiser_allocate(&cpi->denoiser, width, height,
-                                  cpi->common.mb_rows, cpi->common.mb_cols);
+                                  cm->mb_rows, cm->mb_cols,
+                                  cpi->oxcf.noise_sensitivity);
         }
     }
 #endif
@@ -1879,6 +1907,13 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
      */
    cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 5;
+    if (cpi->oxcf.number_of_layers == 1) {
+      cpi->cyclic_refresh_mode_max_mbs_perframe =
+          (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+    } else if (cpi->oxcf.number_of_layers == 2) {
+      cpi->cyclic_refresh_mode_max_mbs_perframe =
+          (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+    }
     cpi->cyclic_refresh_mode_index = 0;
     cpi->cyclic_refresh_q = 32;
 
@@ -1889,6 +1924,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
+    CHECK_MEM_ERROR(cpi->consec_zero_last,
+                    vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
 #ifdef VP8_ENTROPY_STATS
     init_context_counters();
 #endif
@@ -2409,6 +2447,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
         vpx_free(cpi->mb.ss);
         vpx_free(cpi->tok);
         vpx_free(cpi->cyclic_refresh_map);
+        vpx_free(cpi->consec_zero_last);
 
         vp8_remove_common(&cpi->common);
         vpx_free(cpi);
@@ -3232,17 +3271,9 @@ static void update_reference_frames(VP8_COMP *cpi)
         if (cm->frame_type == KEY_FRAME)
         {
             int i;
-            vp8_yv12_copy_frame(
-                cpi->Source,
-                &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            vp8_yv12_extend_frame_borders(
-                &cpi->denoiser.yv12_running_avg[LAST_FRAME]);
-
-            for (i = 2; i < MAX_REF_FRAMES - 1; i++)
-                vp8_yv12_copy_frame(
-                    &cpi->denoiser.yv12_running_avg[LAST_FRAME],
-                    &cpi->denoiser.yv12_running_avg[i]);
+            for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i)
+                vp8_yv12_copy_frame(cpi->Source,
+                                    &cpi->denoiser.yv12_running_avg[i]);
         }
         else /* For non key frames */
         {
@@ -3479,6 +3510,9 @@ static void encode_frame_to_data_rate
         {
             cpi->mb.rd_thresh_mult[i] = 128;
         }
+
+        // Reset the zero_last counter to 0 on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
     }
 
 #if 0
@@ -3900,6 +3934,7 @@ static void encode_frame_to_data_rate
 
 #endif
 
+
 #ifdef OUTPUT_YUV_SRC
     vp8_write_yuv_frame(yuv_file, cpi->Source);
 #endif
@@ -3995,6 +4030,8 @@ static void encode_frame_to_data_rate
             else
                 disable_segmentation(cpi);
         }
+        // Reset the consec_zero_last counter on key frame.
+        vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
 
         vp8_set_quantizer(cpi, Q);
     }
 
@@ -5037,7 +5074,8 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
             /* Update frame rates for each layer */
             assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS);
-            for (i=0; i<cpi->oxcf.number_of_layers; i++)
+            for (i = 0; i < cpi->oxcf.number_of_layers &&
+                        i < VPX_TS_MAX_LAYERS; ++i)
             {
                 LAYER_CONTEXT *lc = &cpi->layer_context[i];
                 lc->framerate = cpi->ref_framerate /
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
     int cyclic_refresh_mode_index;
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
+    // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+    unsigned char *consec_zero_last;
 
     // Frame counter for the temporal pattern. Counter is reset when the temporal
     // layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..d0ad7212d 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
 
 extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
 
-
 int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
                                 int_mv *bestmv, int_mv *ref_mv,
                                 int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
      */
     calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
 
+#if CONFIG_TEMPORAL_DENOISING
+    if (cpi->oxcf.noise_sensitivity) {
+      rd_adjustment = (int)(rd_adjustment *
+          cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+    }
+#endif
+
     /* if we encode a new mv this is important
      * find the best new motion vector
      */
@@ -1168,7 +1174,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
@@ -1183,8 +1188,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                     recon_yoffset, recon_uvoffset,
                                     &cpi->common.lf_info, mb_row, mb_col,
-                                    block_index, uv_denoise);
-
+                                    block_index);
 
             /* Reevaluate ZEROMV after denoising. */
             if (best_mbmode.ref_frame == INTRA_FRAME &&
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 98d60160c..2f6f5d07c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2511,7 +2511,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
 #if CONFIG_TEMPORAL_DENOISING
     if (cpi->oxcf.noise_sensitivity)
     {
-        int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
         int block_index = mb_row * cpi->common.mb_cols + mb_col;
         if (x->best_sse_inter_mode == DC_PRED)
         {
@@ -2525,8 +2524,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
                                     recon_yoffset, recon_uvoffset,
                                     &cpi->common.lf_info, mb_row, mb_col,
-                                    block_index, uv_denoise);
-
+                                    block_index);
 
             /* Reevaluate ZEROMV after denoising.
*/ if (best_mode.mbmode.ref_frame == INTRA_FRAME && diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 76c1582fd..db27ba5fc 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1318,9 +1318,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = "vp8.fpf" /* first pass filename */ #endif VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ -#ifdef CONFIG_SPATIAL_SVC {0}, -#endif {0}, /* ss_target_bitrate */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ @@ -1328,7 +1326,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 0, /* ts_periodicity */ {0}, /* ts_layer_id */ }}, - { -1, {NOT_IMPLEMENTED}} }; @@ -1345,8 +1342,6 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_init, /* vpx_codec_init_fn_t init; */ vp8e_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8e_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ { NOT_IMPLEMENTED, /* vpx_codec_peek_si_fn_t peek_si; */ NOT_IMPLEMENTED, /* vpx_codec_get_si_fn_t get_si; */ @@ -1354,6 +1349,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ }, { + 1, /* 1 cfg map */ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index fb3c236ce..9a0cdb79a 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -60,6 +60,7 @@ struct vpx_codec_alg_priv vpx_decrypt_cb decrypt_cb; void *decrypt_state; vpx_image_t img; + int flushed; int img_setup; struct frame_buffers yv12_frame_buffers; void *user_priv; @@ -88,6 +89,7 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx) ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si); ctx->priv->alg_priv->decrypt_cb = NULL; ctx->priv->alg_priv->decrypt_state = NULL; + ctx->priv->alg_priv->flushed = 0; ctx->priv->init_flags = ctx->init_flags; if (ctx->config.dec) @@ -328,6 +330,13 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, unsigned int resolution_change = 0; unsigned int w, h; + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return VPX_CODEC_OK; + } + + /* Reset flushed when receiving a valid frame */ + ctx->flushed = 0; /* Update the input fragment data */ if(update_fragments(ctx, data, data_sz, &res) <= 0) @@ -737,8 +746,9 @@ static vpx_codec_err_t vp8_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, if (corrupted && pbi) { - *corrupted = pbi->common.frame_to_show->corrupted; - + const YV12_BUFFER_CONFIG *const frame = pbi->common.frame_to_show; + if (frame == NULL) return VPX_CODEC_ERROR; + *corrupted = frame->corrupted; return VPX_CODEC_OK; } else @@ -794,8 +804,6 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = vp8_init, /* vpx_codec_init_fn_t init; */ vp8_destroy, /* vpx_codec_destroy_fn_t destroy; */ vp8_ctf_maps, /* vpx_codec_ctrl_fn_map_t *ctrl_maps; */ - NOT_IMPLEMENTED, /* vpx_codec_get_mmap_fn_t get_mmap; */ - NOT_IMPLEMENTED, /* vpx_codec_set_mmap_fn_t set_mmap; */ { vp8_peek_si, /* vpx_codec_peek_si_fn_t peek_si; */ vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ @@ -804,6 +812,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = NOT_IMPLEMENTED, }, { /* encoder functions */ + 0, NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED, diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 638d39247..a0d078d33 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -34,7 +34,7 @@ void 
vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { cm->mi_cols = aligned_width >> MI_SIZE_LOG2; cm->mi_rows = aligned_height >> MI_SIZE_LOG2; - cm->mi_stride = cm->mi_cols + MI_BLOCK_SIZE; + cm->mi_stride = calc_mi_size(cm->mi_cols); cm->mb_cols = (cm->mi_cols + 1) >> 1; cm->mb_rows = (cm->mi_rows + 1) >> 1; @@ -60,16 +60,18 @@ static int alloc_mi(VP9_COMMON *cm, int mi_size) { for (i = 0; i < 2; ++i) { cm->mip_array[i] = - (MODE_INFO *)vpx_calloc(mi_size, sizeof(*cm->mip)); + (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO)); if (cm->mip_array[i] == NULL) return 1; cm->mi_grid_base_array[i] = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*)); if (cm->mi_grid_base_array[i] == NULL) return 1; } + cm->mi_alloc_size = mi_size; + // Init the index. cm->mi_idx = 0; cm->prev_mi_idx = 1; @@ -131,7 +133,8 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) { vp9_free_context_buffers(cm); vp9_set_mb_mi(cm, width, height); - if (alloc_mi(cm, cm->mi_stride * (cm->mi_rows + MI_BLOCK_SIZE))) goto fail; + if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows))) + goto fail; cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1); if (!cm->last_frame_seg_map) goto fail; diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c index 2404cfcb7..dab8f9617 100644 --- a/vp9/common/vp9_blockd.c +++ b/vp9/common/vp9_blockd.c @@ -146,10 +146,4 @@ void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y) { xd->plane[i].subsampling_x = i ? ss_x : 0; xd->plane[i].subsampling_y = i ? ss_y : 0; } -#if CONFIG_ALPHA - // TODO(jkoleszar): Using the Y w/h for now - xd->plane[3].plane_type = PLANE_TYPE_Y; - xd->plane[3].subsampling_x = 0; - xd->plane[3].subsampling_y = 0; -#endif } diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b9a04dae0..bc6f9ebc5 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -125,9 +125,9 @@ typedef struct { BLOCK_SIZE sb_type; PREDICTION_MODE mode; TX_SIZE tx_size; - uint8_t skip; - uint8_t segment_id; - uint8_t seg_id_predicted; // valid only when temporal_update is enabled + int8_t skip; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled // Only for INTRA blocks PREDICTION_MODE uv_mode; @@ -169,11 +169,7 @@ enum mv_precision { MV_PRECISION_Q4 }; -#if CONFIG_ALPHA -enum { MAX_MB_PLANE = 4 }; -#else enum { MAX_MB_PLANE = 3 }; -#endif struct buf_2d { uint8_t *buf; diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8248cc532..feb90ce66 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -49,7 +49,7 @@ extern "C" { #define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)) static INLINE uint8_t clip_pixel(int val) { - return (val > 255) ? 255u : (val < 0) ? 0u : val; + return (val > 255) ? 255 : (val < 0) ? 
0 : val;
 }
 
 static INLINE int clamp(int value, int low, int high) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index eb0692048..2aadf0ee8 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -205,12 +205,13 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
       break;
     default:
       assert(0 && "Invalid transform size.");
+      break;
   }
 
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static const INLINE scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                                          PLANE_TYPE type, int block_idx) {
   const MODE_INFO *const mi = xd->mi[0];
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 9036c54cf..8817fdbb9 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -26,11 +26,11 @@ extern "C" {
 #define MI_MASK (MI_BLOCK_SIZE - 1)
 
 // Bitstream profiles indicated by 2-3 bits in the uncompressed header.
-// 00: Profile 0. 8-bit color with 4:2:0 chroma sampling only.
-// 10: Profile 1. 8-bit color with 4:4:4, 4:2:2, or 4:4:0 chroma sampling.
-// 01: Profile 2. 10-bit and 12-bit color, with 4:2:0 chroma sampling.
-// 110: Profile 3. 10-bit and 12-bit color, with 4:2:2, 4:4:4, or 4:4:0
-// chroma sampling.
+// 00: Profile 0. 8-bit 4:2:0 only.
+// 10: Profile 1. 8-bit 4:4:4, 4:2:2, and 4:4:0.
+// 01: Profile 2. 10-bit and 12-bit color only, with 4:2:0 sampling.
+// 110: Profile 3. 10-bit and 12-bit color only, with 4:2:2/4:4:4/4:4:0
+//      sampling.
 // 111: Undefined profile.
 typedef enum BITSTREAM_PROFILE {
   PROFILE_0,
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index eaa4a5210..cd0a962e4 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -89,6 +89,22 @@ static const tran_high_t sinpi_4_9 = 15212;
 
 static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+#if CONFIG_VP9_HIGH
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+  // For valid VP9 input streams, intermediate stage coefficients should always
+  // stay within the range of a signed 16 bit integer. Coefficients can go out
+  // of this range for invalid/corrupt VP9 streams. However, strictly checking
+  // this range for every intermediate coefficient can be burdensome for a
+  // decoder, therefore the following assertion is only enabled when configured
+  // with --enable-coefficient-range-checking.
+  assert(INT16_MIN <= rv);
+  assert(rv <= INT16_MAX);
+#endif
   return (tran_low_t)rv;
 }
 
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 0fe58c5c8..ab64d3036 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -20,7 +20,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
                              int block, int mi_row, int mi_col) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
-  const MODE_INFO *prev_mi = cm->coding_use_prev_mi && cm->prev_mi
+  const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
         ? cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]
         : NULL;
   const MB_MODE_INFO *const prev_mbmi = prev_mi ?
&prev_mi->mbmi : NULL; diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 7bce3fa37..a937b7823 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -125,7 +125,7 @@ static const int idx_n_column_to_subblock[4][2] = { // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units -static void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { +static INLINE void clamp_mv_ref(MV *mv, const MACROBLOCKD *xd) { clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER, xd->mb_to_right_edge + MV_BORDER, xd->mb_to_top_edge - MV_BORDER, diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 342416190..b6b664a0b 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -68,9 +68,6 @@ typedef struct VP9Common { DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]); -#endif COLOR_SPACE color_space; @@ -138,16 +135,13 @@ typedef struct VP9Common { int y_dc_delta_q; int uv_dc_delta_q; int uv_ac_delta_q; -#if CONFIG_ALPHA - int a_dc_delta_q; - int a_ac_delta_q; -#endif /* We allocate a MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ int mi_idx; int prev_mi_idx; + int mi_alloc_size; MODE_INFO *mip_array[2]; MODE_INFO **mi_grid_base_array[2]; @@ -199,11 +193,6 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - // Flag indicates if prev_mi can be used in coding: - // 0: encoder assumes decoder does not have prev_mi - // 1: encoder assumes decoder has and uses prev_mi - unsigned int coding_use_prev_mi; - int log2_tile_cols, log2_tile_rows; // Private data associated with the frame buffer callbacks. @@ -218,6 +207,15 @@ typedef struct VP9Common { ENTROPY_CONTEXT *above_context; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= REF_FRAMES) + return NULL; + if (cm->ref_frame_map[index] < 0) + return NULL; + assert(cm->ref_frame_map[index] < REF_FRAMES); + return &cm->frame_bufs[cm->ref_frame_map[index]].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) { return &cm->frame_bufs[cm->new_fb_idx].buf; } @@ -282,6 +280,11 @@ static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) { } } +static INLINE int calc_mi_size(int len) { + // len is in mi units. 
+ return len + MI_BLOCK_SIZE; +} + static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, int mi_row, int bh, int mi_col, int bw, @@ -349,7 +352,7 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, #if CONFIG_VP9_HIGH static INLINE unsigned int bit_depth_to_bps(vpx_bit_depth_t bit_depth) { - int bps; + int bps = 8; switch (bit_depth) { case VPX_BITS_8: bps = 8; diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index 47d6bd21a..ff848487f 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -467,19 +467,13 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { #if CONFIG_VP9_HIGH @@ -508,19 +502,13 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + 0.0065 + 0.5); int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, - src->alpha_width}; - const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, - src->alpha_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width}; + const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { const int src_stride = src_strides[i]; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index bc9d6ef5e..014638466 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -353,9 +353,9 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd)); const int has_above = above_mbmi != NULL; const int has_left = left_mbmi != NULL; - int above_ctx = (has_above && !above_mbmi->skip) ? 
above_mbmi->tx_size + int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size : max_tx_size; - int left_ctx = (has_left && !left_mbmi->skip) ? left_mbmi->tx_size + int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size : max_tx_size; if (!has_left) left_ctx = above_ctx; @@ -366,7 +366,7 @@ int vp9_get_tx_size_context(const MACROBLOCKD *xd) { return (above_ctx + left_ctx) > max_tx_size; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int mi_offset = mi_row * cm->mi_cols + mi_col; const int bw = num_8x8_blocks_wide_lookup[bsize]; diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h index 1a7ba86e4..2c965068a 100644 --- a/vp9/common/vp9_pred_common.h +++ b/vp9/common/vp9_pred_common.h @@ -26,7 +26,7 @@ static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) { return xd->left_available ? xd->mi[-1] : NULL; } -int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, +int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids, BLOCK_SIZE bsize, int mi_row, int mi_col); static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) { diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h index f36148035..fa0e36da4 100644 --- a/vp9/common/vp9_prob.h +++ b/vp9/common/vp9_prob.h @@ -44,21 +44,12 @@ typedef int8_t vp9_tree_index; typedef const vp9_tree_index vp9_tree[]; static INLINE vp9_prob clip_prob(int p) { - return (p > 255) ? 255u : (p < 1) ? 1u : p; + return (p > 255) ? 255 : (p < 1) ? 1 : p; } -// int64 is not needed for normal frame level calculations. -// However when outputting entropy stats accumulated over many frames -// or even clips we can overflow int math. -#ifdef ENTROPY_STATS static INLINE vp9_prob get_prob(int num, int den) { return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); } -#else -static INLINE vp9_prob get_prob(int num, int den) { - return (den == 0) ? 
128u : clip_prob((num * 256 + (den >> 1)) / den); -} -#endif static INLINE vp9_prob get_binary_prob(int n0, int n1) { return get_prob(n0, n0 + n1); diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index f418b5cab..b42e654f5 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -237,6 +237,7 @@ int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #elif CONFIG_VP9_HIGH switch (bit_depth) { @@ -248,6 +249,7 @@ int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return dc_qlookup[clamp(qindex + delta, 0, MAXQ)] << 4; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; @@ -266,6 +268,7 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #elif CONFIG_VP9_HIGH switch (bit_depth) { @@ -277,6 +280,7 @@ int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { return ac_qlookup[clamp(qindex + delta, 0, MAXQ)] << 4; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index a0c2070cb..05c2a1364 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -769,13 +769,13 @@ add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance32x32 avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -784,13 +784,13 @@ add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_variance8x8 mmx neon/, "$sse2_x86inc"; add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; +specialize qw/vp9_get8x8var mmx neon/, "$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; 
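/* [Editor's note] A minimal sketch of why the vp9_prob.h hunk above drops
 * the ENTROPY_STATS-only variant and always widens to int64_t: once counts
 * are accumulated over many frames, num * 256 no longer fits in 32 bits.
 * get_prob_sketch() restates get_prob() plus clip_prob() so the sketch is
 * self-contained; the counts below are hypothetical. */
#include <assert.h>
#include <stdint.h>

static uint8_t get_prob_sketch(int num, int den) {
  int64_t p;
  if (den == 0) return 128;
  p = ((int64_t)num * 256 + (den >> 1)) / den;  /* would overflow in int32 */
  return (uint8_t)(p > 255 ? 255 : (p < 1 ? 1 : p));  /* clip to [1, 255] */
}

int main(void) {
  /* 20M hits out of 30M trials: num * 256 = 5.12e9 > INT32_MAX. */
  assert(get_prob_sketch(20000000, 30000000) == 171);
  return 0;
}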
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; +specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -802,10 +802,10 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_ specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -832,13 +832,13 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance32x32 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -856,7 +856,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_p specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance8x8 neon/, "$sse2_x86inc", "$ssse3_x86inc"; 
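/* [Editor's note] The specialize lines above only register optimized kernels;
 * runtime CPU detection selects one. This is a hedged, hand-written sketch of
 * that mechanism -- the real dispatch code is generated from
 * vp9_rtcd_defs.pl, and sad_fn/rtcd_init_sketch/vp9_sad8x8_ptr here are
 * illustrative names, not libvpx API. */
#include <stdint.h>
#include <stdlib.h>

typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

static unsigned int sad8x8_c(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 8; ++r, src += src_stride, ref += ref_stride)
    for (c = 0; c < 8; ++c)
      sad += abs(src[c] - ref[c]);
  return sad;
}

/* Stand-in for the real NEON kernel added by this patch. */
static unsigned int sad8x8_neon_stub(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride) {
  return sad8x8_c(src, src_stride, ref, ref_stride);
}

sad_fn vp9_sad8x8_ptr = sad8x8_c;  /* portable default */

void rtcd_init_sketch(int cpu_has_neon) {
  /* "specialize qw/vp9_sad8x8 mmx neon/" means the generated setup code may
     install the NEON kernel when CPU detection reports support. */
  if (cpu_has_neon) vp9_sad8x8_ptr = sad8x8_neon_stub;
}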
add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -909,7 +909,7 @@ add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stri specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc"; +specialize qw/vp9_sad8x8 mmx neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vp9_sad8x4/, "$sse2_x86inc"; @@ -1008,7 +1008,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x4x8 sse4/; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad64x64x4d sse2/; +specialize qw/vp9_sad64x64x4d sse2 avx2/; add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad32x64x4d sse2/; @@ -1023,7 +1023,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, co specialize qw/vp9_sad16x32x4d sse2/; add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; -specialize qw/vp9_sad32x32x4d sse2/; +specialize qw/vp9_sad32x32x4d sse2 avx2/; add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad16x16x4d sse2/; @@ -1064,7 +1064,7 @@ specialize qw/vp9_get_mb_ss mmx sse2/; # ENCODEMB INVOKE add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block/, "$sse2_x86inc"; +specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { # the transform coefficients are held in 32-bit @@ -1088,7 +1088,7 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_block_error avx2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp/, "$ssse3_x86_64"; + specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; @@ -1112,6 +1112,7 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { } # fdct functions + if 
(vpx_config("CONFIG_VP9_HIGH") eq "yes") { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht4x4/; @@ -1153,10 +1154,10 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_fdct32x32_rd/; } else { add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2 avx2/; + specialize qw/vp9_fht4x4 sse2/; add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2 avx2/; + specialize qw/vp9_fht8x8 sse2/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; specialize qw/vp9_fht16x16 sse2/; @@ -1168,13 +1169,13 @@ if (vpx_config("CONFIG_VP9_HIGH") eq "yes") { specialize qw/vp9_fdct4x4_1 sse2/; add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct4x4 sse2 avx2/; + specialize qw/vp9_fdct4x4 sse2/; add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8_1 sse2/; + specialize qw/vp9_fdct8x8_1 sse2 neon/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64"; + specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct16x16_1 sse2/; diff --git a/vp9/common/vp9_seg_common.c b/vp9/common/vp9_seg_common.c index 345f2a1e6..e07fa1b37 100644 --- a/vp9/common/vp9_seg_common.c +++ b/vp9/common/vp9_seg_common.c @@ -52,11 +52,10 @@ int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { void vp9_set_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id, int seg_data) { - const int data_max = vp9_seg_feature_data_max(feature_id); - assert(seg_data <= data_max); + assert(seg_data <= vp9_seg_feature_data_max(feature_id)); if (seg_data < 0) { assert(seg_feature_data_signed[feature_id]); - assert(-seg_data <= data_max); + assert(-seg_data <= vp9_seg_feature_data_max(feature_id)); } seg->feature_data[segment_id][feature_id] = seg_data; diff --git a/vp9/common/x86/vp9_postproc_x86.h b/vp9/common/x86/vp9_postproc_x86.h deleted file mode 100644 index cab9d34f2..000000000 --- a/vp9/common/x86/vp9_postproc_x86.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ -#define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -/* Note: - * - * This platform is commonly built for runtime CPU detection. 
If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx); -extern prototype_postproc(vp9_post_proc_down_and_across_mmx); -extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_mmx - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm); -extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm); -extern prototype_postproc(vp9_post_proc_down_and_across_xmm); -extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_xmm - -#undef vp9_postproc_across -#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm - -#undef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm - -#undef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_wmt - - -#endif -#endif - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_COMMON_X86_VP9_POSTPROC_X86_H_ diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index d0d5eadbe..ba5d6b5a5 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -194,8 +194,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, int eob) { struct macroblockd_plane *const pd = &xd->plane[plane]; if (eob > 0) { - TX_TYPE tx_type; - const PLANE_TYPE plane_type = pd->plane_type; + TX_TYPE tx_type = DCT_DCT; tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); #if CONFIG_VP9_HIGH if (xd->cur_buf->flags&YV12_FLAG_HIGH) { @@ -203,6 +202,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_high_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bps); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -229,6 +229,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -256,6 +257,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block, tx_type = DCT_DCT; vp9_iwht4x4_add(dqcoeff, dst, stride, eob); } else { + const PLANE_TYPE plane_type = pd->plane_type; switch (tx_size) { case TX_4X4: tx_type = get_tx_type_4x4(plane_type, xd, block); @@ -684,13 +686,20 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { } static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Width and height beyond allowed size."); +#endif if (cm->width != width || cm->height != height) { - // Change in frame size (assumption: color format does not change). 
- if (cm->width == 0 || cm->height == 0 || - width * height > cm->width * cm->height) { + const int new_rows = ALIGN_POWER_OF_TWO(height, + MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_cols = ALIGN_POWER_OF_TWO(width, + MI_SIZE_LOG2) >> MI_SIZE_LOG2; + if (calc_mi_size(new_rows) * calc_mi_size(new_cols) > cm->mi_alloc_size) { if (vp9_alloc_context_buffers(cm, width, height)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffers"); + "Failed to allocate context buffers"); } else { vp9_set_mb_mi(cm, width, height); } @@ -724,6 +733,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { int width, height; int found = 0, i; + int has_valid_ref_frame = 0; for (i = 0; i < REFS_PER_FRAME; ++i) { if (vp9_rb_read_bit(rb)) { YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf; @@ -737,15 +747,21 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, if (!found) vp9_read_frame_size(rb, &width, &height); - // Check that each of the frames that this frame references has valid - // dimensions. + if (width <= 0 || height <= 0) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of the frames that this frame references + // has valid dimensions. for (i = 0; i < REFS_PER_FRAME; ++i) { RefBuffer *const ref_frame = &cm->frame_refs[i]; - if (!valid_ref_frame_size(ref_frame->buf->y_width, ref_frame->buf->y_height, - width, height)) - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Referenced frame has invalid size"); + has_valid_ref_frame |= valid_ref_frame_size(ref_frame->buf->y_crop_width, + ref_frame->buf->y_crop_height, + width, height); } + if (!has_valid_ref_frame) + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); resize_context_buffers(cm, width, height); setup_display_size(cm, rb); @@ -1180,8 +1196,8 @@ static void read_bitdepth_colorspace_sampling( } } else { if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) { - // Note: If colorspace is SRGB then only 4:4:4 chroma sampling - // is supported. + // Note: if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed.
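/* [Editor's note] Worked example for the resize_context_buffers() change
 * above, assuming the usual VP9 constants MI_SIZE_LOG2 == 3 and
 * MI_BLOCK_SIZE == 8; the macro and calc_mi_size() are restated here so the
 * sketch is self-contained. */
#include <assert.h>

#define MI_SIZE_LOG2 3
#define MI_BLOCK_SIZE 8
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

static int calc_mi_size(int len) {
  return len + MI_BLOCK_SIZE;  /* len is in mi units; add the border */
}

int main(void) {
  /* A 1920x1080 frame spans 240x135 8-pel mi units... */
  const int new_cols = ALIGN_POWER_OF_TWO(1920, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  const int new_rows = ALIGN_POWER_OF_TWO(1080, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
  assert(new_cols == 240 && new_rows == 135);
  /* ...and reallocation is needed only when the bordered grid outgrows the
     existing mi_alloc_size. */
  assert(calc_mi_size(new_rows) * calc_mi_size(new_cols) == 143 * 248);
  return 0;
}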
cm->subsampling_y = cm->subsampling_x = 0; if (vp9_rb_read_bit(rb)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, @@ -1235,6 +1251,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (!vp9_read_sync_code(rb)) vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); + read_bitdepth_colorspace_sampling(cm, rb); pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; @@ -1264,6 +1281,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->color_space = BT_601; cm->subsampling_y = cm->subsampling_x = 1; } + pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES); setup_frame_size(cm, rb); } else { @@ -1301,11 +1319,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, } } if (!cm->error_resilient_mode) { - cm->coding_use_prev_mi = 1; cm->refresh_frame_context = vp9_rb_read_bit(rb); cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb); } else { - cm->coding_use_prev_mi = 0; cm->refresh_frame_context = 0; cm->frame_parallel_decoding_mode = 1; } @@ -1481,7 +1497,7 @@ void vp9_decode_frame(VP9Decoder *pbi, init_macroblockd(cm, &pbi->mb); - if (cm->coding_use_prev_mi) + if (!cm->error_resilient_mode) set_prev_mi(cm); else cm->prev_mi = NULL; diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 1afaee1e3..32e80f93b 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -435,6 +435,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + const int ref_idx = frame - LAST_FRAME; + if (cm->frame_refs[ref_idx].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_idx].sf.y_scale_fp == REF_INVALID_SCALE ) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame], mi_row, mi_col); } diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 0343c214c..1a4155825 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -123,8 +123,12 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, * later commit that adds VP9-specific controls for this functionality. */ if (ref_frame_flag == VP9_LAST_FLAG) { - const YV12_BUFFER_CONFIG *const cfg = - &cm->frame_bufs[cm->ref_frame_map[0]].buf; + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, 0); + if (cfg == NULL) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "No 'last' reference frame"); + return VPX_CODEC_ERROR; + } if (!equal_dimensions(cfg, sd)) vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); @@ -181,17 +185,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, return cm->error.error_code; } - -int vp9_get_reference_dec(VP9Decoder *pbi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &pbi->common; - - if (index < 0 || index >= REF_FRAMES) - return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - /* If any buffer updating is signaled it should be done here. 
*/ static void swap_frame_buffers(VP9Decoder *pbi) { int ref_index = 0, mask; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 8e16e1cac..223b66fc7 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -74,9 +74,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_dec(struct VP9Decoder *pbi, - int index, YV12_BUFFER_CONFIG **fb); - struct VP9Decoder *vp9_decoder_create(); void vp9_decoder_remove(struct VP9Decoder *pbi); diff --git a/vp9/decoder/vp9_reader.h b/vp9/decoder/vp9_reader.h index 32e200e2b..2d9eccfbf 100644 --- a/vp9/decoder/vp9_reader.h +++ b/vp9/decoder/vp9_reader.h @@ -52,7 +52,7 @@ int vp9_reader_has_error(vp9_reader *r); const uint8_t *vp9_reader_find_end(vp9_reader *r); -static int vp9_read(vp9_reader *r, int prob) { +static INLINE int vp9_read(vp9_reader *r, int prob) { unsigned int bit = 0; BD_VALUE value; BD_VALUE bigsplit; @@ -89,11 +89,11 @@ static int vp9_read(vp9_reader *r, int prob) { return bit; } -static int vp9_read_bit(vp9_reader *r) { +static INLINE int vp9_read_bit(vp9_reader *r) { return vp9_read(r, 128); // vp9_prob_half } -static int vp9_read_literal(vp9_reader *r, int bits) { +static INLINE int vp9_read_literal(vp9_reader *r, int bits) { int literal = 0, bit; for (bit = bits - 1; bit >= 0; bit--) @@ -102,8 +102,8 @@ static int vp9_read_literal(vp9_reader *r, int bits) { return literal; } -static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, - const vp9_prob *probs) { +static INLINE int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree, + const vp9_prob *probs) { vp9_tree_index i = 0; while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0) diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000..6c66f5d5b --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const
int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = 
vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 000000000..2d5ec79b3 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include <math.h> + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + + for (i = 0; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = + vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + { + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/vp9/encoder/arm/neon/vp9_sad_neon.c b/vp9/encoder/arm/neon/vp9_sad_neon.c index fe40b5452..c4cd85680 --- a/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_sad_neon.c @@ -26,9 +26,8 @@ static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, vreinterpret_u32_u64(vget_high_u64(b))); return vget_lane_u32(c, 0); } -static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t a = vpaddlq_u16(vaddq_u16(vec_lo, vec_hi)); +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); const uint64x2_t b = vpaddlq_u32(a); const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); @@ -93,7 +92,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride, vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16)); } - return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); } unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, @@ -112,5 +111,20 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride, vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); } - return horizontal_add_16x8(vec_accum_lo, vec_accum_hi); + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); } diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c new file mode 100644 index 000000000..b4bf567db --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +void vp9_subtract_block_neon(int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), + vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), + vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), + vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), + vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), + vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), + vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) + diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 000000000..816fbda1f --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth8 = 8 }; +enum { kHeight8 = 8 }; +enum { kHeight8PlusOne = 9 }; +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kWidth32 = 32 }; +enum { kHeight32 = 32 }; +enum { kHeight32PlusOne = 33 }; +enum { kPixelStepOne = 1 }; +enum { kAlign16 = 16 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8, + kHeight8, sse, sum); +} + +unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a =
vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne, + kHeight8PlusOne, kWidth8, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8, + kWidth8, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse); +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} + +void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32, + kHeight32, sse, sum); +} + +unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32)); +} + +unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32); + DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 
kPixelStepOne, + kHeight32PlusOne, kWidth32, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32, + kWidth32, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse); +} diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index bfcbee7b0..d9654d2b5 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -80,13 +80,13 @@ static void prob_diff_update(const vp9_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMP *cpi, +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *xd, TX_SIZE tx_size, BLOCK_SIZE bsize, vp9_writer *w) { const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd, - &cpi->common.fc.tx_probs); + &cm->fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -95,14 +95,13 @@ static void write_selected_tx_size(const VP9_COMP *cpi, } } -static int write_skip(const VP9_COMP *cpi, int segment_id, const MODE_INFO *mi, - vp9_writer *w) { - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; - if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) { +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MODE_INFO *mi, vp9_writer *w) { + if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; } else { const int skip = mi->mbmi.skip; - vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd)); + vp9_write(w, skip, vp9_get_skip_prob(cm, xd)); return skip; } } @@ -123,7 +122,7 @@ static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) { } static void pack_mb_tokens(vp9_writer *w, - TOKENEXTRA **tp, const TOKENEXTRA *stop, + TOKENEXTRA **tp, const TOKENEXTRA *const stop, vpx_bit_depth_t bit_depth) { TOKENEXTRA *p = *tp; @@ -198,9 +197,8 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg, } // This function encodes the reference frame -static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, + vp9_writer *w) { const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; const int is_compound = has_second_ref(mbmi); const int segment_id = mbmi->segment_id; @@ -262,7 +260,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } - skip = write_skip(cpi, segment_id, mi, w); + skip = write_skip(cm, xd, segment_id, mi, w); if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd)); @@ -270,7 +268,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !(is_inter && (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); } if (!is_inter) { @@ -291,7 +289,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } else { const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx]; - write_ref_frames(cpi, w); + 
write_ref_frames(cm, xd, w); // If segment skip is not enabled code the mode. if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { @@ -339,10 +337,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, } } -static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, - vp9_writer *w) { - const VP9_COMMON *const cm = &cpi->common; - const MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, + MODE_INFO **mi_8x8, vp9_writer *w) { const struct segmentation *const seg = &cm->seg; const MODE_INFO *const mi = mi_8x8[0]; const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride]; @@ -353,10 +349,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, if (seg->update_map) write_segment_id(w, seg, mbmi->segment_id); - write_skip(cpi, mbmi->segment_id, mi, w); + write_skip(cm, xd, mbmi->segment_id, mi, w); if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT) - write_selected_tx_size(cpi, mbmi->tx_size, bsize, w); + write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w); if (bsize >= BLOCK_8X8) { write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0)); @@ -378,9 +374,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8, } static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + vp9_writer *w, TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; MODE_INFO *m; @@ -392,7 +389,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cpi, xd->mi, w); + write_mb_modes_kf(cm, xd, xd->mi, w); } else { pack_inter_mode_mvs(cpi, m, w); } @@ -401,7 +398,8 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, pack_mb_tokens(w, tok, tok_end, cm->bit_depth); } -static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, +static void write_partition(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, int mi_row, int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) { const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -423,21 +421,23 @@ static void write_partition(VP9_COMMON *cm, MACROBLOCKD *xd, } static void write_modes_sb(VP9_COMP *cpi, - const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end, + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int bsl = b_width_log2(bsize); const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; BLOCK_SIZE subsize; - MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + const MODE_INFO *m = NULL; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + partition = partition_lookup[bsl][m->mbmi.sb_type]; write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); @@ -479,8 +479,8 @@ static void write_modes_sb(VP9_COMP *cpi, } static void write_modes(VP9_COMP *cpi, - const TileInfo *const tile, - vp9_writer *w, TOKENEXTRA 
**tok, TOKENEXTRA *tok_end) { + const TileInfo *const tile, vp9_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) { int mi_row, mi_col; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; @@ -530,7 +530,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi, int i, j, k, l, t; switch (cpi->sf.use_fast_coef_updates) { case TWO_LOOP: { - /* dry run to see if there is any udpate at all needed */ + /* dry run to see if there is any update at all needed */ int savings = 0; int update[2] = {0, 0}; for (i = 0; i < PLANE_TYPES; ++i) { @@ -741,7 +741,7 @@ static void write_delta_q(struct vp9_write_bit_buffer *wb, int delta_q) { } } -static void encode_quantization(VP9_COMMON *cm, +static void encode_quantization(const VP9_COMMON *const cm, struct vp9_write_bit_buffer *wb) { vp9_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS); write_delta_q(wb, cm->y_dc_delta_q); @@ -749,12 +749,11 @@ static void encode_quantization(VP9_COMMON *cm, write_delta_q(wb, cm->uv_ac_delta_q); } - -static void encode_segmentation(VP9_COMP *cpi, +static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd, struct vp9_write_bit_buffer *wb) { int i, j; - struct segmentation *seg = &cpi->common.seg; + const struct segmentation *seg = &cm->seg; vp9_wb_write_bit(wb, seg->enabled); if (!seg->enabled) @@ -764,7 +763,7 @@ static void encode_segmentation(VP9_COMP *cpi, vp9_wb_write_bit(wb, seg->update_map); if (seg->update_map) { // Select the coding strategy (temporal or spatial) - vp9_choose_segmap_coding_method(cpi); + vp9_choose_segmap_coding_method(cm, xd); // Write out probabilities used to decode unpredicted macro-block segments for (i = 0; i < SEG_TREE_PROBS; i++) { const int prob = seg->tree_probs[i]; @@ -812,7 +811,6 @@ static void encode_segmentation(VP9_COMP *cpi, } } - static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) { // Mode vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2); @@ -881,7 +879,8 @@ static void fix_interp_filter(VP9_COMMON *cm) { } } -static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { +static void write_tile_info(const VP9_COMMON *const cm, + struct vp9_write_bit_buffer *wb) { int min_log2_tile_cols, max_log2_tile_cols, ones; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -900,28 +899,29 @@ static void write_tile_info(VP9_COMMON *cm, struct vp9_write_bit_buffer *wb) { } static int get_refresh_mask(VP9_COMP *cpi) { - if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) { - // Preserve the previously existing golden frame and update the frame in - // the alt ref slot instead. This is highly specific to the use of - // alt-ref as a forward reference, and this needs to be generalized as - // other uses are implemented (like RTC/temporal scaling) - // - // gld_fb_idx and alt_fb_idx need to be swapped for future frames, but - // that happens in vp9_encoder.c:update_reference_frames() so that it can - // be done outside of the recode loop. 
- return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->alt_fb_idx); - } else { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } - return (cpi->refresh_last_frame << cpi->lst_fb_idx) | - (cpi->refresh_golden_frame << cpi->gld_fb_idx) | - (cpi->refresh_alt_ref_frame << arf_idx); + if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term we leave it in the GF slot and, + // if we're updating the GF with the current decoded frame, we save it + // instead to the ARF slot. + // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we + // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it + // there so that it can be done outside of the recode loop. + // Note: This is highly specific to the use of ARF as a forward reference, + // and this needs to be generalized as other uses are implemented + // (like RTC/temporal scalability). + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->alt_fb_idx); + } else { + int arf_idx = cpi->alt_fb_idx; + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + arf_idx = gf_group->arf_update_idx[gf_group->index]; } + return (cpi->refresh_last_frame << cpi->lst_fb_idx) | + (cpi->refresh_golden_frame << cpi->gld_fb_idx) | + (cpi->refresh_alt_ref_frame << arf_idx); + } } static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { @@ -1101,10 +1101,12 @@ static void write_uncompressed_header(VP9_COMP *cpi, if (cm->intra_only) { write_sync_code(wb); + // Note for profile 0, 420 8bpp is assumed. if (cm->profile > PROFILE_0) { write_bitdepth_colorspace_sampling(cm, wb); } + vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES); write_frame_size(cm, wb); } else { @@ -1134,7 +1136,7 @@ static void write_uncompressed_header(VP9_COMP *cpi, encode_loopfilter(&cm->lf, wb); encode_quantization(cm, wb); - encode_segmentation(cpi, wb); + encode_segmentation(cm, &cpi->mb.e_mbd, wb); write_tile_info(cm, wb); } @@ -1226,11 +1228,9 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { saved_wb = wb; vp9_wb_write_literal(&wb, 0, 16); // don't know in advance first part. 
size - uncompressed_hdr_size = vp9_rb_bytes_written(&wb); + uncompressed_hdr_size = vp9_wb_bytes_written(&wb); data += uncompressed_hdr_size; - vp9_compute_update_table(); - vp9_clear_system_state(); first_part_size = write_compressed_header(cpi, data); @@ -1242,4 +1242,3 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { *size = data - dest; } - diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h index ddfd0ed4f..8e82d1c97 100644 --- a/vp9/encoder/vp9_bitstream.h +++ b/vp9/encoder/vp9_bitstream.h @@ -16,11 +16,21 @@ extern "C" { #endif -struct VP9_COMP; +#include "vp9/encoder/vp9_encoder.h" void vp9_entropy_mode_init(); -void vp9_pack_bitstream(struct VP9_COMP *cpi, uint8_t *dest, size_t *size); +void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); + +static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { + return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && + cpi->rc.is_src_frame_alt_ref && + (!cpi->use_svc || // Add spatial svc base layer case here + (is_spatial_svc(cpi) && + cpi->svc.spatial_layer_id == 0 && + cpi->svc.layer_context[0].gold_ref_idx >= 0 && + cpi->oxcf.ss_play_alternate[0])); +} #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 1e5dc1a4c..a640e30c9 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -42,6 +42,7 @@ struct macroblock_plane { int16_t *zbin; int16_t *round; + int64_t quant_thred[2]; // Zbin Over Quant value int16_t zbin_extra; }; @@ -67,7 +68,7 @@ struct macroblock { int sadperbit4; int rddiv; int rdmult; - unsigned int mb_energy; + int mb_energy; int mv_best_ref_index[MAX_REF_FRAMES]; unsigned int max_mv_context[MAX_REF_FRAMES]; @@ -116,7 +117,9 @@ struct macroblock { int quant_fp; // skip forward transform and quantization - int skip_txfm; + int skip_txfm[MAX_MB_PLANE]; + + int64_t bsse[MAX_MB_PLANE]; // Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES]; diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h index d6d5efd96..50c3fb706 100644 --- a/vp9/encoder/vp9_context_tree.h +++ b/vp9/encoder/vp9_context_tree.h @@ -34,7 +34,7 @@ typedef struct { int is_coded; int num_4x4_blk; int skip; - int skip_txfm; + int skip_txfm[MAX_MB_PLANE]; int best_mode_index; int hybrid_pred_diff; int comp_pred_diff; @@ -42,7 +42,7 @@ typedef struct { int64_t tx_rd_diff[TX_MODES]; int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING unsigned int newmv_sse; unsigned int zeromv_sse; PREDICTION_MODE best_sse_inter_mode; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index ca84a1b39..90ea9cc25 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -345,10 +345,9 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, int refresh_last_frame) { if (frame_type == KEY_FRAME) { int i; - copy_frame(denoiser->running_avg_y[LAST_FRAME], src); - for (i = 2; i < MAX_REF_FRAMES - 1; i++) { - copy_frame(denoiser->running_avg_y[i], - denoiser->running_avg_y[LAST_FRAME]); + // Start at 1 so as not to overwrite the INTRA_FRAME + for (i = 1; i < MAX_REF_FRAMES; ++i) { + copy_frame(denoiser->running_avg_y[i], src); } } else { /* For non key frames */ if (refresh_alt_ref_frame) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 49b24b407..a609b6dc6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -247,11 +247,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, } } -static void duplicate_mode_info_in_sb(VP9_COMMON * const cm, - MACROBLOCKD *const xd, - int mi_row, - int mi_col, - BLOCK_SIZE bsize) { +static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int block_width = num_8x8_blocks_wide_lookup[bsize]; const int block_height = num_8x8_blocks_high_lookup[bsize]; int i, j; @@ -319,6 +317,8 @@ typedef enum { static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { int i; + node->part_variances = NULL; + vpx_memset(node->split, 0, sizeof(node->split)); switch (bsize) { case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; @@ -350,6 +350,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } default: { assert(0); + break; } } } @@ -612,10 +613,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. - else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col, bsize, 1); - vp9_init_plane_quantizers(cpi, x); } } @@ -712,10 +712,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer }; + const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride }; int i; // Set current frame pointer. 
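The vp9_setup_src_planes() hunk above shrinks the plane tables from four entries to three, dropping the unused alpha plane so only Y, U and V remain, with the two chroma planes sharing a single stride. A minimal self-contained sketch of that wiring, assuming a hypothetical, simplified plane_src struct in place of the real struct macroblock_plane:

#include <stdint.h>

// Hypothetical, simplified stand-ins for YV12_BUFFER_CONFIG and the
// per-plane source struct; the real encoder types carry many more fields.
struct yv12 {
  uint8_t *y_buffer, *u_buffer, *v_buffer;
  int y_stride, uv_stride;
};
struct plane_src {
  uint8_t *buf;
  int stride;
};

// Wire up the three source planes; U and V share the one chroma stride.
static void setup_src_planes(struct plane_src planes[3],
                             const struct yv12 *src) {
  uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
  const int strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
  int i;
  for (i = 0; i < 3; ++i) {
    planes[i].buf = buffers[i];
    planes[i].stride = strides[i];
  }
}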
@@ -882,9 +880,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, } } -static void update_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const MACROBLOCK *const x = &cpi->mb; +static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) { const MACROBLOCKD *const xd = &x->e_mbd; const MODE_INFO *const mi = xd->mi[0]; const MB_MODE_INFO *const mbmi = &mi->mbmi; @@ -996,7 +992,7 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); if (output_enabled) { - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1069,6 +1065,7 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1405,19 +1402,17 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } x->skip = ctx->skip; - x->skip_txfm = mbmi->segment_id ? 0 : ctx->skip_txfm; + x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - - set_offsets(cpi, tile, mi_row, mi_col, bsize); update_state_rt(cpi, ctx, mi_row, mi_col, bsize); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) { vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col, MAX(BLOCK_8X8, bsize), ctx); @@ -1425,7 +1420,7 @@ static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile, #endif encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx); - update_stats(cpi); + update_stats(&cpi->common, &cpi->mb); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -1448,7 +1443,6 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, return; if (bsize >= BLOCK_8X8) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int idx_str = xd->mi_stride * mi_row + mi_col; MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str; ctx = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -1496,6 +1490,7 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert("Invalid partition type."); + break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -1547,7 +1542,7 @@ static void rd_use_partition(VP9_COMP *cpi, pc_tree->partitioning = partition; save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - if (bsize == BLOCK_16X16) { + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) { set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } @@ -1674,6 +1669,7 @@ static void rd_use_partition(VP9_COMP *cpi, break; default: assert(0); + break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -1809,10 +1805,9 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = { // // The min and max are assumed to have been initialized prior to calling this // function so repeat calls can accumulate a min and max of more than one sb64. 
-static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8, - BLOCK_SIZE * min_block_size, - BLOCK_SIZE * max_block_size ) { - MACROBLOCKD *const xd = &cpi->mb.e_mbd; +static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8, + BLOCK_SIZE *min_block_size, + BLOCK_SIZE *max_block_size ) { int sb_width_in_blocks = MI_BLOCK_SIZE; int sb_height_in_blocks = MI_BLOCK_SIZE; int i, j; @@ -1867,17 +1862,17 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, if (cm->frame_type != KEY_FRAME) { MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; - get_sb_partition_size_range(cpi, prev_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the left SB64 if (left_in_image) { MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, left_sb64_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size); } // Find the min and max partition sizes used in the above SB64. if (above_in_image) { MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE]; - get_sb_partition_size_range(cpi, above_sb64_mi, &min_size, &max_size); + get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size); } // adjust observed min and max if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { @@ -1920,7 +1915,7 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE max_size = BLOCK_8X8; int bsl = mi_width_log2(BLOCK_64X64); const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm)) % 2; + get_chessboard_index(cm->current_video_frame)) & 0x1; // Trap case where we do not have a prediction. if (search_range_ctrl && (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { @@ -1964,6 +1959,60 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; + + MODE_INFO *mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + BLOCK_SIZE bs, min_size, max_size; + + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + if (prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? 
mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; +} + static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -1972,13 +2021,58 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } +#if CONFIG_FP_MB_STATS +const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; +const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = + {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120}; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6}; + +typedef enum { + MV_ZERO = 0, + MV_LEFT = 1, + MV_UP = 2, + MV_RIGHT = 3, + MV_DOWN = 4, + MV_INVALID +} MOTION_DIRECTION; + +static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { + if (fp_byte & FPMB_MOTION_ZERO_MASK) { + return MV_ZERO; + } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { + return MV_LEFT; + } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { + return MV_RIGHT; + } else if (fp_byte & FPMB_MOTION_UP_MASK) { + return MV_UP; + } else { + return MV_DOWN; + } +} + +static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, + MOTION_DIRECTION that_mv) { + if (this_mv == that_mv) { + return 0; + } else { + return abs(this_mv - that_mv) == 2 ? 2 : 1; + } +} +#endif + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. 
static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd, + int64_t *dist, int64_t best_rd, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -1995,12 +2089,21 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int64_t sum_rd = 0; int do_split = bsize >= BLOCK_8X8; int do_rect = 1; + // Override skipping rectangular partition operations for edge blocks const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; + BLOCK_SIZE min_size = cpi->sf.min_partition_size; + BLOCK_SIZE max_size = cpi->sf.max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; + int none_complexity = 0; +#endif + int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -2011,22 +2114,28 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - if (bsize == BLOCK_16X16) { - set_offsets(cpi, tile, mi_row, mi_col, bsize); + set_offsets(cpi, tile, mi_row, mi_col, bsize); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) x->mb_energy = vp9_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { partition_horz_allowed &= force_horz_split; @@ -2056,6 +2165,65 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } } +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, bsize); + } +#endif + +#if CONFIG_FP_MB_STATS + // Decide whether we shall split directly and skip searching NONE by using + // the first pass block statistics + if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_split && + partition_none_allowed && src_diff_var > 4 && + cm->base_qindex < qindex_split_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + // Compute a complexity measure, basically measuring the inconsistency of + // motion vectors obtained from the first pass in the current block + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + + MOTION_DIRECTION this_mv; + MOTION_DIRECTION right_mv; + MOTION_DIRECTION bottom_mv; + + this_mv = + get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); + + // to its right + if (c != mb_col_end - 1) { + right_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + 1]); + none_complexity += get_motion_inconsistency(this_mv, right_mv); + } + + // to its bottom + if (r != mb_row_end - 1) { + bottom_mv = get_motion_direction_fp( + cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); + none_complexity += get_motion_inconsistency(this_mv, bottom_mv); + } + + // do not count its left and top neighbors to avoid double counting + } + } + + if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { + partition_none_allowed = 0; + } + } +#endif + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, @@ -2066,6 +2234,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, this_rate += cpi->partition_cost[pl][PARTITION_NONE]; } sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist); + if (sum_rd < best_rd) { int64_t stop_thresh = 4096; int64_t stop_thresh_rd; @@ -2087,6 +2256,52 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, do_split = 0; do_rect = 0; } + +#if CONFIG_FP_MB_STATS + // Check if the first pass statistics of every 16x16 block show zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame.
If the variance is small enough, + // stop further splitting in RD optimization + if (cpi->use_fp_mb_stats && do_split != 0 && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if (!(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_MOTION_ZERO_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_SMALL_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + if (src_diff_var == UINT_MAX) { + set_offsets(cpi, tile, mi_row, mi_col, bsize); + src_diff_var = get_sby_perpixel_diff_variance( + cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize); + } + if (src_diff_var < 8) { + do_split = 0; + do_rect = 0; + } + } + } +#endif } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2109,17 +2324,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, ctx->mic.mbmi.interp_filter; rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, pc_tree->leaf_split[0], best_rd, 0); - if (sum_rate == INT_MAX) { + if (sum_rate == INT_MAX) sum_rd = INT64_MAX; - } else { + else sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); - if (sum_rd < best_rd) { - update_state(cpi, pc_tree->leaf_split[0], mi_row, mi_col, subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, - pc_tree->leaf_split[0]); - update_partition_context(xd, mi_row, mi_col, subsize, bsize); - } - } } else { for (i = 0; i < 4 && sum_rd < best_rd; ++i) { const int x_idx = (i & 1) * mi_step; @@ -2131,8 +2339,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + pc_tree->split[i]->index = i; rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rate, &this_dist, i != 3, + subsize, &this_rate, &this_dist, best_rd - sum_rd, pc_tree->split[i]); if (this_rate == INT_MAX) { @@ -2149,6 +2358,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, pl = partition_plane_context(xd, mi_row, mi_col, bsize); sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT]; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); + if (sum_rd < best_rd) { best_rate = sum_rate; best_dist = sum_dist; @@ -2261,6 +2471,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } + // TODO(jbb): This code added so that we avoid static analysis // warning related to the fact that best_rd isn't used after this // point. 
This code should be refactored so that the duplicate @@ -2269,7 +2480,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, *rate = best_rate; *dist = best_dist; - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); // Check the projected output rate for this SB against its target @@ -2326,6 +2537,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } vp9_zero(cpi->mb.pred_mv); + cpi->pc_root->index = 0; if ((sf->partition_search_type == SEARCH_PARTITION && sf->use_lastframe_partitioning) || @@ -2357,11 +2569,11 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } else { GF_GROUP * gf_grp = &cpi->twopass.gf_group; int last_was_mid_sequence_overlay = 0; - if ((cpi->pass == 2) && (gf_grp->index)) { + if ((cpi->oxcf.pass == 2) && (gf_grp->index)) { if (gf_grp->update_type[gf_grp->index - 1] == OVERLAY_UPDATE) last_was_mid_sequence_overlay = 1; } - if ((cm->current_video_frame + if ((cpi->rc.frames_since_key % sf->last_partitioning_redo_frequency) == 0 || last_was_mid_sequence_overlay || cm->prev_mi == 0 @@ -2379,7 +2591,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } else { if (sf->constrain_copy_partition && @@ -2401,7 +2613,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); } } } @@ -2456,41 +2668,23 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) { else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) return ALTREF_FRAME; else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) - return LAST_FRAME; - else return GOLDEN_FRAME; + else + return LAST_FRAME; } static TX_MODE select_tx_mode(const VP9_COMP *cpi) { - if (cpi->mb.e_mbd.lossless) { + if (cpi->mb.e_mbd.lossless) return ONLY_4X4; - } else if (cpi->common.current_video_frame == 0) { + if (cpi->common.frame_type == KEY_FRAME) return TX_MODE_SELECT; - } else { - if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { - return ALLOW_32X32; - } else if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - const RD_OPT *const rd_opt = &cpi->rd; - const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi); - return rd_opt->tx_select_threshes[frame_type][ALLOW_32X32] > - rd_opt->tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; - } else if (cpi->sf.tx_size_search_method == USE_TX_8X8) { - return TX_MODE_SELECT; - } else { - unsigned int total = 0; - int i; - for (i = 0; i < TX_SIZES; ++i) - total += cpi->tx_stepdown_count[i]; - - if (total) { - const double fraction = (double)cpi->tx_stepdown_count[0] / total; - return fraction > 0.90 ?
ALLOW_32X32 : TX_MODE_SELECT; - } else { - return cpi->common.tx_mode; - } - } - } + if (cpi->sf.tx_size_search_method == USE_LARGESTALL) + return ALLOW_32X32; + else if (cpi->sf.tx_size_search_method == USE_FULL_RD || + cpi->sf.tx_size_search_method == USE_TX_8X8) + return TX_MODE_SELECT; + else + return cpi->common.tx_mode; } static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, @@ -2579,6 +2773,8 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, int mi_col, BLOCK_SIZE bsize, int *rate, int64_t *dist, int do_recon, int64_t best_rd, PC_TREE *pc_tree) { + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2610,18 +2806,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. - if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + if (sf->auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= sf->max_partition_size && + bsize >= sf->min_partition_size); + partition_horz_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= sf->max_partition_size && + bsize > sf->min_partition_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > sf->min_partition_size; } - if (cpi->sf.use_square_partition_only) { + if (sf->use_square_partition_only) { partition_horz_allowed &= force_horz_split; partition_vert_allowed &= force_vert_split; } @@ -2631,7 +2827,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, ctx); ctx->mic.mbmi = xd->mi[0]->mbmi; - ctx->skip_txfm = x->skip_txfm; + ctx->skip_txfm[0] = x->skip_txfm[0]; ctx->skip = x->skip; if (this_rate != INT_MAX) { @@ -2700,7 +2896,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, } else { // skip rectangular partition test when larger block size // gives better rd cost - if (cpi->sf.less_rectangular_check) + if (sf->less_rectangular_check) do_rect &= !partition_none_allowed; } } @@ -2708,7 +2904,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[0].skip_txfm = x->skip_txfm; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); @@ -2728,7 +2924,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &pc_tree->horizontal[1]); 
pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[1].skip_txfm = x->skip_txfm; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; if (this_rate == INT_MAX) { @@ -2753,14 +2949,14 @@ if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (cpi->sf.adaptive_motion_search) + if (sf->adaptive_motion_search) load_pred_mv(x, ctx); nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[0].skip_txfm = x->skip_txfm; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { @@ -2769,7 +2965,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, &this_rate, &this_dist, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[1].skip_txfm = x->skip_txfm; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; if (this_rate == INT_MAX) { sum_rd = INT64_MAX; @@ -2810,12 +3006,12 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // Check the projected output rate for this SB against its target // and if necessary apply a Q delta using segmentation to get // closer to the target. - if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { + if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) { vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate); } - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + if (oxcf->aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh, best_rate, best_dist); @@ -2860,20 +3056,20 @@ static void nonrd_use_partition(VP9_COMP *cpi, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->none); pc_tree->none.mic.mbmi = xd->mi[0]->mbmi; - pc_tree->none.skip_txfm = x->skip_txfm; + pc_tree->none.skip_txfm[0] = x->skip_txfm[0]; pc_tree->none.skip = x->skip; break; case PARTITION_VERT: nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->vertical[0]); pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[0].skip_txfm = x->skip_txfm; + pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[0].skip = x->skip; if (mi_col + hbs < cm->mi_cols) { nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs, &rate, &dist, subsize, &pc_tree->vertical[1]); pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->vertical[1].skip_txfm = x->skip_txfm; + pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->vertical[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { @@ -2886,13 +3082,13 @@ static void nonrd_use_partition(VP9_COMP *cpi, nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize, &pc_tree->horizontal[0]); pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi; - pc_tree->horizontal[0].skip_txfm = x->skip_txfm; + pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[0].skip = x->skip; if (mi_row + hbs < cm->mi_rows) { nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col, &rate, &dist, subsize, &pc_tree->horizontal[1]); pc_tree->horizontal[1].mic.mbmi = 
xd->mi[0]->mbmi; - pc_tree->horizontal[1].skip_txfm = x->skip_txfm; + pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0]; pc_tree->horizontal[1].skip = x->skip; if (rate != INT_MAX && dist != INT64_MAX && *totrate != INT_MAX && *totdist != INT64_MAX) { @@ -2933,6 +3129,7 @@ static void nonrd_use_partition(VP9_COMP *cpi, break; default: assert("Invalid partition type."); + break; } if (bsize == BLOCK_64X64 && output_enabled) { @@ -2945,9 +3142,10 @@ static void nonrd_use_partition(VP9_COMP *cpi, static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, int mi_row, TOKENEXTRA **tp) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &cpi->mb; - MACROBLOCKD *xd = &x->e_mbd; + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; int mi_col; // Initialize the left context for the new SB row @@ -2957,7 +3155,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, // Code each SB in the row for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { - MACROBLOCK *x = &cpi->mb; int dummy_rate = 0; int64_t dummy_dist = 0; const int idx_str = cm->mi_stride * mi_row + mi_col; @@ -2970,7 +3167,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, vp9_zero(x->pred_mv); // Set the partition type of the 64X64 block - switch (cpi->sf.partition_search_type) { + switch (sf->partition_search_type) { case VAR_BASED_PARTITION: choose_partitioning(cpi, tile, mi_row, mi_col); nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, @@ -2983,20 +3180,20 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, break; case VAR_BASED_FIXED_PARTITION: case FIXED_PARTITION: - bsize = cpi->sf.partition_search_type == FIXED_PARTITION ? - cpi->sf.always_this_block_size : + bsize = sf->partition_search_type == FIXED_PARTITION ? 
+ sf->always_this_block_size : get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col); set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize); nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rate, &dummy_dist, cpi->pc_root); break; case REFERENCE_PARTITION: - if (cpi->sf.partition_check || + if (sf->partition_check || !is_background(cpi, tile, mi_row, mi_col)) { set_modeinfo_offsets(cm, xd, mi_row, mi_col); auto_partition_range(cpi, tile, mi_row, mi_col, - &cpi->sf.min_partition_size, - &cpi->sf.max_partition_size); + &sf->min_partition_size, + &sf->max_partition_size); nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); @@ -3009,14 +3206,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, break; default: assert(0); + break; } } } // end RTC play code static int set_var_thresh_from_histogram(VP9_COMP *cpi) { - SPEED_FEATURES *const sf = &cpi->sf; - VP9_COMMON *const cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + const VP9_COMMON *const cm = &cpi->common; const uint8_t *src = cpi->Source->y_buffer; const uint8_t *last_src = cpi->Last_Source->y_buffer; @@ -3199,30 +3397,28 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(rd_opt->tx_select_diff); vp9_zero(rd_opt->tx_select_threshes); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; + xd->lossless = cm->base_qindex == 0 && + cm->y_dc_delta_q == 0 && + cm->uv_dc_delta_q == 0 && + cm->uv_ac_delta_q == 0; cm->tx_mode = select_tx_mode(cpi); #if CONFIG_VP9_HIGH if (cm->use_high) - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; else - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_high_fwht4x4 : - vp9_high_fdct4x4; - cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - cpi->mb.high_itxm_add = cpi->mb.e_mbd.lossless ? vp9_high_iwht4x4_add : - vp9_high_idct4x4_add; + x->fwd_txm4x4 = xd->lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4; + x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->high_itxm_add = xd->lossless ? vp9_high_iwht4x4_add : vp9_high_idct4x4_add; #else - cpi->mb.fwd_txm4x4 = cpi->mb.e_mbd.lossless ? vp9_fwht4x4 : vp9_fdct4x4; + x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4; #endif - cpi->mb.itxm_add = cpi->mb.e_mbd.lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - if (cpi->mb.e_mbd.lossless) { - cpi->mb.optimize = 0; - cpi->common.lf.filter_level = 0; + if (xd->lossless) { + x->optimize = 0; + cm->lf.filter_level = 0; cpi->zbin_mode_boost_enabled = 0; } @@ -3234,7 +3430,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { set_prev_mi(cm); x->quant_fp = cpi->sf.use_quant_fp; - x->skip_txfm = 0; + vp9_zero(x->skip_txfm); if (sf->use_nonrd_pick_mode) { // Initialize internal buffer pointers for rtc coding, where non-RD // mode decision is used and hence no buffer pointer swap needed. 
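The encode_frame_internal() changes above hinge on a single lossless test: the stream is lossless exactly when the base quantizer index and all three delta-q values are zero, and in that case the reversible 4x4 Walsh-Hadamard transform is selected in place of the DCT. A small sketch of that selection under stated assumptions; the *_stub functions are hypothetical stand-ins for vp9_fwht4x4 and vp9_fdct4x4, whose real signatures differ:

#include <stdint.h>

typedef void (*txfm4x4_fn)(const int16_t *input, int16_t *output, int stride);

// Hypothetical stand-ins for vp9_fwht4x4 (reversible Walsh-Hadamard) and
// vp9_fdct4x4 (lossy DCT); only the selection logic is of interest here.
static void fwht4x4_stub(const int16_t *in, int16_t *out, int stride) {
  (void)in; (void)out; (void)stride;
}
static void fdct4x4_stub(const int16_t *in, int16_t *out, int stride) {
  (void)in; (void)out; (void)stride;
}

// VP9 is lossless iff the base quantizer index and every delta-q are zero.
static int is_lossless(int base_qindex, int y_dc_delta_q,
                       int uv_dc_delta_q, int uv_ac_delta_q) {
  return base_qindex == 0 && y_dc_delta_q == 0 &&
         uv_dc_delta_q == 0 && uv_ac_delta_q == 0;
}

// Lossless coding requires the reversible WHT; otherwise use the DCT.
static txfm4x4_fn select_fwd_txfm4x4(int lossless) {
  return lossless ? fwht4x4_stub : fdct4x4_stub;
}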
@@ -3474,7 +3670,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, MODE_INFO **mi_8x8 = xd->mi; MODE_INFO *mi = mi_8x8[0]; MB_MODE_INFO *mbmi = &mi->mbmi; - unsigned int segment_id = mbmi->segment_id; + const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id, + SEG_LVL_SKIP); const int mis = cm->mi_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -3484,6 +3681,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ && cpi->sf.allow_skip_recode; + if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode) + vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); + x->skip_optimize = ctx->is_coded; ctx->is_coded = 1; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; @@ -3518,7 +3718,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, &xd->block_refs[ref]->sf); } - if (!cpi->sf.reuse_inter_pred_sby) + if (!cpi->sf.reuse_inter_pred_sby || seg_skip) vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8)); @@ -3529,8 +3729,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8)); } else { mbmi->skip = 1; - if (output_enabled && - !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + if (output_enabled && !seg_skip) cm->counts.skip[vp9_get_skip_context(xd)][1]++; reset_skip_context(xd, MAX(bsize, BLOCK_8X8)); } @@ -3539,9 +3738,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && - !(is_inter_block(mbmi) && - (mbmi->skip || - vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) { + !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd), &cm->counts.tx)[mbmi->tx_size]; } else { diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 5bfe60c45..f50a0dbb8 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -32,7 +32,7 @@ struct optimize_ctx { struct encode_b_args { MACROBLOCK *x; struct optimize_ctx *ctx; - unsigned char *skip; + int8_t *skip; }; void vp9_subtract_block_c(int rows, int cols, @@ -465,6 +465,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -543,6 +544,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -630,6 +632,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, break; default: assert(0); + break; } } @@ -658,22 +661,22 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, return; } - if (x->skip_txfm == 0) { - // full forward transform and quantization - if (!x->skip_recode) { + if (!x->skip_recode) { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization if (x->quant_fp) vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size); else vp9_xform_quant(x, plane, block, plane_bsize, tx_size); + } else if (x->skip_txfm[plane] == 2) { + // fast path forward transform and quantization + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + } else { + // skip forward transform + p->eobs[block] = 0; + *a = *l = 0; + 
return; } - } else if (x->skip_txfm == 2) { - // fast path forward transform and quantization - vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); - } else { - // skip forward transform - p->eobs[block] = 0; - *a = *l = 0; - return; } if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { @@ -735,6 +738,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, break; default: assert(0 && "Invalid transform size"); + break; } } @@ -1022,6 +1026,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, break; default: assert(0); + break; } if (*eob) *(args->skip) = 0; @@ -1029,7 +1034,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip) { + int8_t *skip) { struct encode_b_args arg = {x, NULL, skip}; encode_block_intra(plane, block, plane_bsize, tx_size, &arg); } diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 0b8c3d2b0..199971865 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -33,7 +33,7 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, - unsigned char *skip); + int8_t *skip); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1922e3cee..4a019928c 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -65,9 +65,6 @@ void vp9_coef_tree_initialize(); #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file = NULL; #endif -#ifdef OUTPUT_YUV_SRC -FILE *yuv_file; -#endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif @@ -135,7 +132,8 @@ static void setup_frame(VP9_COMP *cpi) { } if (cm->frame_type == KEY_FRAME) { - cpi->refresh_golden_frame = 1; + if (!is_spatial_svc(cpi)) + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; } else { cm->fc = cm->frame_contexts[cm->frame_context_idx]; @@ -497,7 +495,7 @@ static void update_frame_size(VP9_COMP *cpi) { vp9_init_context_buffers(cm); init_macroblockd(cm, xd); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -511,7 +509,7 @@ static void update_frame_size(VP9_COMP *cpi) { } void vp9_new_framerate(VP9_COMP *cpi, double framerate) { - cpi->oxcf.framerate = framerate < 0.1 ? 30 : framerate; + cpi->framerate = framerate < 0.1 ? 
30 : framerate; vp9_rc_update_framerate(cpi); } @@ -544,6 +542,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; @@ -579,23 +578,6 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { set_tile_limits(cpi); } -static int get_pass(MODE mode) { - switch (mode) { - case REALTIME: - case ONE_PASS_GOOD: - case ONE_PASS_BEST: - return 0; - - case TWO_PASS_FIRST: - return 1; - - case TWO_PASS_SECOND_GOOD: - case TWO_PASS_SECOND_BEST: - return 2; - } - return -1; -} - void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -607,7 +589,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { assert(cm->bit_depth <= VPX_BITS_12); cpi->oxcf = *oxcf; - cpi->pass = get_pass(cpi->oxcf.mode); #if CONFIG_VP9_HIGH if (cpi->oxcf.use_high) { cpi->mb.e_mbd.bps = bit_depth_to_bps(cm->bit_depth); @@ -660,7 +641,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { rc->buffer_level = MIN(rc->buffer_level, rc->maximum_buffer_size); // Set up frame rate and related parameters rate control values. - vp9_new_framerate(cpi, cpi->oxcf.framerate); + vp9_new_framerate(cpi, cpi->framerate); // Set absolute upper and lower quality limits rc->worst_quality = cpi->oxcf.worst_allowed_q; @@ -682,7 +663,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_update_layer_context_change_config(cpi, (int)cpi->oxcf.target_bandwidth); } @@ -705,7 +686,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { high_set_var_fns(cpi); #endif -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1316,7 +1297,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->use_svc = 0; init_config(cpi, oxcf); - vp9_rc_init(&cpi->oxcf, cpi->pass, &cpi->rc); + vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); cm->current_video_frame = 0; @@ -1368,7 +1349,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { // pending further tuning and testing. The code is left in place here // as a place holder in regard to the required paths. 
cpi->multi_arf_last_grp_enabled = 0; - if (cpi->pass == 2) { + if (oxcf->pass == 2) { if (cpi->use_svc) { cpi->multi_arf_allowed = 0; cpi->multi_arf_enabled = 0; @@ -1435,14 +1416,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->mb.nmvsadcost_hp[1] = &cpi->mb.nmvsadcosts_hp[1][MV_MAX]; cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif #endif -#ifdef OUTPUT_YUV_SRC - yuv_file = fopen("bd.yuv", "ab"); -#endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif @@ -1456,9 +1434,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; - if (cpi->pass == 1) { + if (oxcf->pass == 1) { vp9_init_first_pass(cpi); - } else if (cpi->pass == 2) { + } else if (oxcf->pass == 2) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); @@ -1638,7 +1616,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_clear_system_state(); // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count); - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; @@ -1694,7 +1672,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #endif } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_free(&(cpi->denoiser)); } @@ -1718,14 +1696,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_remove_common(&cpi->common); vpx_free(cpi); -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif #endif -#ifdef OUTPUT_YUV_SRC - fclose(yuv_file); -#endif #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif @@ -1990,16 +1965,6 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, } } -int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) { - VP9_COMMON *cm = &cpi->common; - - if (index < 0 || index >= REF_FRAMES) - return -1; - - *fb = &cm->frame_bufs[cm->ref_frame_map[index]].buf; - return 0; -} - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); @@ -2017,36 +1982,7 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) { return 0; } - -#if defined(OUTPUT_YUV_SRC) -void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #if defined(OUTPUT_YUV_DENOISED) // The denoiser buffer is allocated as a YUV 440 buffer. This function writes it // as YUV 420. 
We simply use the top-left pixels of the UV buffers, since we do @@ -2136,17 +2072,6 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { src += s->uv_stride; } while (--h); -#if CONFIG_ALPHA - if (s->alpha_buffer) { - src = s->alpha_buffer; - h = s->alpha_crop_height; - do { - fwrite(src, s->alpha_crop_width, 1, yuv_rec_file); - src += s->alpha_stride; - } while (--h); - } -#endif - fflush(yuv_rec_file); } #endif @@ -2162,22 +2087,18 @@ static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, #endif // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - const int src_widths[4] = {src->y_crop_width, src->uv_crop_width, - src->uv_crop_width, src->y_crop_width}; - const int src_heights[4] = {src->y_crop_height, src->uv_crop_height, - src->uv_crop_height, src->y_crop_height}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; - const int dst_widths[4] = {dst->y_crop_width, dst->uv_crop_width, - dst->uv_crop_width, dst->y_crop_width}; - const int dst_heights[4] = {dst->y_crop_height, dst->uv_crop_height, - dst->uv_crop_height, dst->y_crop_height}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + const int src_widths[3] = {src->y_crop_width, src->uv_crop_width, + src->uv_crop_width }; + const int src_heights[3] = {src->y_crop_height, src->uv_crop_height, + src->uv_crop_height}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; + const int dst_widths[3] = {dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width}; + const int dst_heights[3] = {dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height}; for (i = 0; i < MAX_MB_PLANE; ++i) { #if CONFIG_VP9_HIGH @@ -2210,14 +2131,10 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; - const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, - dst->alpha_buffer}; - const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, - dst->alpha_stride}; + const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer}; + const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride}; + uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer}; + const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride}; const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP); int x, y, i; @@ -2338,17 +2255,15 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx); ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx); - } else if (!cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && !cpi->use_svc) { - /* Preserve the previously existing golden frame and 
update the frame in - * the alt ref slot instead. This is highly specific to the current use of - * alt-ref as a forward reference, and this needs to be generalized as - * other uses are implemented (like RTC/temporal scaling) - * - * The update to the buffer in the alt ref slot was signaled in - * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated - * as the golden frame next time. - */ + } else if (vp9_preserve_existing_gf(cpi)) { + // We have decided to preserve the previously existing golden frame as our + // new ARF frame. However, in the short term in function + // vp9_bitstream.c::get_refresh_mask() we left it in the GF slot and, if + // we're updating the GF with the current decoded frame, we save it to the + // ARF slot instead. + // We now have to update the ARF with the current frame and swap gld_fb_idx + // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF + // slot and, if we're updating the GF, the current frame becomes the new GF. int tmp; ref_cnt_fb(cm->frame_bufs, @@ -2357,10 +2272,15 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; + + if (is_spatial_svc(cpi)) { + cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; + cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; + } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { int arf_idx = cpi->alt_fb_idx; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; arf_idx = gf_group->arf_update_idx[gf_group->index]; } @@ -2379,7 +2299,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx); } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_info(&cpi->denoiser, *cpi->Source, @@ -2510,8 +2430,8 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { cpi->rc.total_actual_bits, cm->base_qindex, vp9_convert_qindex_to_q(cm->base_qindex), (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0, + vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality), cpi->rc.avg_q, - vp9_convert_qindex_to_q(cpi->rc.ni_av_qi), vp9_convert_qindex_to_q(cpi->oxcf.cq_level), cpi->refresh_last_frame, cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost, @@ -2796,7 +2716,8 @@ static void get_ref_frame_flags(VP9_COMP *cpi) { if (cpi->gold_is_last) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; - if (cpi->rc.frames_till_gf_update_due == INT_MAX) + if (cpi->rc.frames_till_gf_update_due == INT_MAX && + !is_spatial_svc(cpi)) cpi->ref_frame_flags &= ~VP9_GOLD_FLAG; if (cpi->alt_is_last) @@ -2847,9 +2768,7 @@ static void configure_skippable_frame(VP9_COMP *cpi) { // according to the variance SVC *const svc = &cpi->svc; - const int is_spatial_svc = (svc->number_spatial_layers > 1) && - (svc->number_temporal_layers == 1); - TWO_PASS *const twopass = is_spatial_svc ? + TWO_PASS *const twopass = is_spatial_svc(cpi) ? 
&svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass; @@ -2867,7 +2786,7 @@ static void set_arf_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; int arf_sign_bias; - if ((cpi->pass == 2) && cpi->multi_arf_allowed) { + if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; arf_sign_bias = cpi->rc.source_alt_ref_active && (!cpi->refresh_alt_ref_frame || @@ -2955,9 +2874,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, (cpi->oxcf.frame_parallel_decoding_mode != 0); // By default, encoder assumes decoder can use prev_mi. - cm->coding_use_prev_mi = 1; if (cm->error_resilient_mode) { - cm->coding_use_prev_mi = 0; cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; @@ -2971,19 +2888,19 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // static regions if indicated. // Only allowed in second pass of two pass (as requires lagged coding) // and if the relevant speed feature flag is set. - if (cpi->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); // Check if the current frame is skippable for the partition search in the // second pass according to the first pass stats - if (cpi->pass == 2 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { configure_skippable_frame(cpi); } // For 1 pass CBR, check if we are dropping this frame. // Never drop on key frame. - if (cpi->pass == 0 && + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cm->frame_type != KEY_FRAME) { if (vp9_rc_drop_frame(cpi)) { @@ -3020,21 +2937,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif -#ifdef OUTPUT_YUV_SRC - vp9_write_yuv_frame(cpi->Source, yuv_file); -#endif - set_speed_features(cpi); -#if CONFIG_DENOISING -#ifdef OUTPUT_YUV_DENOISED - if (cpi->oxcf.noise_sensitivity > 0) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); - } -#endif -#endif - // Decide q and q bounds. q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index); @@ -3050,6 +2954,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index); } +#if CONFIG_VP9_TEMPORAL_DENOISING +#ifdef OUTPUT_YUV_DENOISED + if (cpi->oxcf.noise_sensitivity > 0) { + vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], + yuv_denoised_file); + } +#endif +#endif + + // Special case code to reduce pulsing when key frames are forced at a // fixed interval. Note the reconstruction error if it is the frame before // the force key frame @@ -3246,8 +3160,6 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, #if CONFIG_VP9_HIGH const int use_high = sd->flags & YV12_FLAG_HIGH; #endif - const int is_spatial_svc = cpi->use_svc && - (cpi->svc.number_temporal_layers == 1); #if CONFIG_VP9_HIGH check_initial_width(cpi, subsampling_x, subsampling_y, use_high); @@ -3257,8 +3169,8 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, vpx_usec_timer_start(&timer); -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) res = vp9_svc_lookahead_push(cpi, cpi->lookahead, sd, time_stamp, end_time, frame_flags); else @@ -3327,7 +3239,7 @@ void adjust_frame_rate(VP9_COMP *cpi) { // over the whole interval seen. 
const double interval = MIN((double)(cpi->source->ts_end - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->oxcf.framerate; + double avg_duration = 10000000.0 / cpi->framerate; avg_duration *= (interval - avg_duration + this_duration); avg_duration /= interval; @@ -3344,7 +3256,7 @@ static int get_arf_src_index(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int arf_src_index = 0; if (is_altref_enabled(cpi)) { - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { arf_src_index = gf_group->arf_src_offset[gf_group->index]; @@ -3359,7 +3271,7 @@ static int get_arf_src_index(VP9_COMP *cpi) { static void check_src_altref(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; rc->is_src_frame_alt_ref = (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); @@ -3388,15 +3300,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, YV12_BUFFER_CONFIG *force_src_buffer = NULL; MV_REFERENCE_FRAME ref_frame; int arf_src_index; - const int is_spatial_svc = cpi->use_svc && - (cpi->svc.number_temporal_layers == 1) && - (cpi->svc.number_spatial_layers > 1); if (!cpi) return -1; - if (is_spatial_svc && cpi->pass == 2) { + if (is_spatial_svc(cpi) && cpi->oxcf.pass == 2) { +#if CONFIG_SPATIAL_SVC vp9_svc_lookahead_peek(cpi, cpi->lookahead, 0, 1); +#endif vp9_restore_layer_context(cpi); } @@ -3419,8 +3330,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, arf_src_index, 0); else @@ -3429,8 +3340,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (cpi->source != NULL) { cpi->alt_ref_source = cpi->source; -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc && cpi->svc.spatial_layer_id > 0) { +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) { int i; // Reference a hidden frame from a lower layer for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { @@ -3464,8 +3375,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (!cpi->source) { // Get last frame source. if (cm->current_video_frame > 0) { -#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->last_source = vp9_svc_lookahead_peek(cpi, cpi->lookahead, -1, 0); else #endif @@ -3475,8 +3386,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. 
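The avg_duration update in adjust_frame_rate() above is easier to see in incremental form. A sketch in the encoder's 10 MHz timestamp units (updated_avg_duration is a hypothetical helper for illustration):

// avg *= (interval - avg + this_duration); avg /= interval;
// is algebraically the same as nudging avg toward this_duration with
// weight avg / interval:
static double updated_avg_duration(double avg, double this_duration,
                                   double interval) {
  return avg + avg * (this_duration - avg) / interval;
}
// Example: at 30 fps, avg = 10000000 / 30 ~ 333333 ticks. One 400000-tick
// frame over a full 10000000-tick window moves avg to ~335556 ticks, so the
// reported framerate eases from 30.0 to ~29.8 fps rather than jumping.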
-#ifdef CONFIG_SPATIAL_SVC - if (is_spatial_svc) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else #endif @@ -3507,7 +3418,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; - if (flush && cpi->pass == 1 && !cpi->twopass.first_pass_done) { + if (flush && cpi->oxcf.pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } @@ -3519,6 +3430,9 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->last_end_time_stamp_seen = cpi->source->ts_start; } + // Clear down mmx registers + vp9_clear_system_state(); + // adjust frame rates based on timestamps given if (cm->show_frame) { adjust_frame_rate(cpi); @@ -3533,9 +3447,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // start with a 0 size frame *size = 0; - // Clear down mmx registers - vp9_clear_system_state(); - /* find a free buffer for the new frame, releasing the reference previously * held. */ @@ -3545,7 +3456,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, if (!cpi->use_svc && cpi->multi_arf_allowed) { if (cm->frame_type == KEY_FRAME) { init_buffer_indices(cpi); - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; } @@ -3553,7 +3464,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; - if (cpi->pass == 2 && + if (cpi->oxcf.pass == 2 && cm->current_video_frame == 0 && cpi->oxcf.allow_spatial_resampling && cpi->oxcf.rc_mode == VPX_VBR) { @@ -3601,8 +3512,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, vp9_vaq_init(); } - if (cpi->pass == 1 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + if (cpi->oxcf.pass == 1 && + (!cpi->use_svc || is_spatial_svc(cpi))) { const int lossless = is_lossless_requested(&cpi->oxcf); #if CONFIG_VP9_HIGH if (cpi->oxcf.use_high) @@ -3616,8 +3527,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi); - } else if (cpi->pass == 2 && - (!cpi->use_svc || cpi->svc.number_temporal_layers == 1)) { + } else if (cpi->oxcf.pass == 2 && + (!cpi->use_svc || is_spatial_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -3641,19 +3552,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Save layer specific state. 
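For orientation, the save below pairs with the vp9_restore_layer_context() call made on entry to vp9_get_compressed_data() earlier in this patch; both are gated on the same condition. A condensed sketch of the round trip (encode_one_layered_frame is illustrative, not a real function):

static void encode_one_layered_frame(VP9_COMP *cpi) {
  const int layered =
      (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
      (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2);
  if (layered) vp9_restore_layer_context(cpi);  // load this layer's RC state
  /* ... encode the frame ... */
  if (layered) vp9_save_layer_context(cpi);     // write the state back
}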
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || - (cpi->svc.number_spatial_layers > 1 && cpi->pass == 2)) { + (cpi->svc.number_spatial_layers > 1 && cpi->oxcf.pass == 2)) { vp9_save_layer_context(cpi); } vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); - if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame) + if (cpi->b_calculate_psnr && cpi->oxcf.pass != 1 && cm->show_frame) generate_psnr_packet(cpi); #if CONFIG_INTERNAL_STATS - if (cpi->pass != 1) { + if (cpi->oxcf.pass != 1) { cpi->bytes += (int)(*size); if (cm->show_frame) { @@ -3920,6 +3831,7 @@ int vp9_high_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, return (int) sse; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } } #endif diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 8f6d1e427..95c07d344 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -37,7 +37,7 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" #endif @@ -135,7 +135,7 @@ typedef struct VP9EncoderConfig { int width; // width of data passed to the compressor int height; // height of data passed to the compressor unsigned int in_bit_depth; // input bit depth - double framerate; // set to passed in framerate + double init_framerate; // set to passed in framerate int64_t target_bandwidth; // bandwidth to be used in kilobits per second int noise_sensitivity; // pre processing blur: recommendation 0 @@ -144,6 +144,7 @@ typedef struct VP9EncoderConfig { unsigned int rc_max_intra_bitrate_pct; MODE mode; + int pass; // Key Framing Operations int auto_key; // autodetect cut scenes and set the keyframes @@ -235,6 +236,7 @@ typedef struct VP9EncoderConfig { vp8e_tuning tuning; /* Flag to say whether we are using 16bit frame buffers */ int use_high; + vp9e_tune_content content; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -307,6 +309,7 @@ typedef struct VP9_COMP { int64_t first_time_stamp_ever; RATE_CONTROL rc; + double framerate; vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; @@ -315,9 +318,6 @@ typedef struct VP9_COMP { MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; int mbgraph_n_frames; // number of frames filled in the above int static_mb_pct; // % forced skip mbs by segmentation - - int pass; - int ref_frame_flags; SPEED_FEATURES sf; @@ -433,7 +433,7 @@ typedef struct VP9_COMP { int multi_arf_enabled; int multi_arf_last_grp_enabled; -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; #endif } VP9_COMP; @@ -465,9 +465,6 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags); int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp9_get_reference_enc(VP9_COMP *cpi, int index, - YV12_BUFFER_CONFIG **fb); - int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd); @@ -512,9 +509,8 @@ static INLINE int frame_is_boosted(const VP9_COMP *cpi) { } static INLINE int get_token_alloc(int mb_rows, int mb_cols) { - // TODO(JBB): make this work for alpha channel and double check we can't - // exceed this token count if we have a 32x32 transform crossing a boundary - // at a multiple of 16. 
+ // TODO(JBB): double check we can't exceed this token count if we have a + // 32x32 transform crossing a boundary at a multiple of 16. // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full // resolution. We assume up to 1 token per pixel, and then allow // a head room of 4. @@ -544,10 +540,16 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm, void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); +static INLINE int is_spatial_svc(const struct VP9_COMP *const cpi) { + return cpi->use_svc && + cpi->svc.number_temporal_layers == 1 && + cpi->svc.number_spatial_layers > 1; +} + static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && (cpi->oxcf.play_alternate && - (!(cpi->use_svc && cpi->svc.number_temporal_layers == 1) || + (!is_spatial_svc(cpi) || cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id])); } @@ -560,8 +562,8 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, : 0]; } -static INLINE int get_chessboard_index(const VP9_COMMON *cm) { - return cm->current_video_frame % 2; +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; } #ifdef __cplusplus diff --git a/vp9/encoder/vp9_extend.c b/vp9/encoder/vp9_extend.c index 55d16e6bd..e1ab83b90 100644 --- a/vp9/encoder/vp9_extend.c +++ b/vp9/encoder/vp9_extend.c @@ -121,18 +121,6 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int eb_uv = eb_y >> uv_height_subsampling; const int er_uv = er_y >> uv_width_subsampling; -#if CONFIG_ALPHA - const int et_a = dst->border >> (dst->alpha_height != dst->y_height); - const int el_a = dst->border >> (dst->alpha_width != dst->y_width); - const int eb_a = et_a + dst->alpha_height - src->alpha_height; - const int er_a = el_a + dst->alpha_width - src->alpha_width; - - copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, - dst->alpha_buffer, dst->alpha_stride, - src->alpha_width, src->alpha_height, - et_a, el_a, eb_a, er_a); -#endif - #if CONFIG_VP9_HIGH if (src->flags & YV12_FLAG_HIGH) { copy_and_extend_plane_high(src->y_buffer, src->y_stride, @@ -152,6 +140,7 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, return; } #endif + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src->y_width, src->y_height, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 9bd83d214..e367f285f 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -22,7 +22,6 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes() #include "vp9/common/vp9_systemdependent.h" - #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_encodeframe.h" @@ -56,7 +55,6 @@ #define MIN_KF_BOOST 300 #define MIN_GF_INTERVAL 4 -#define LONG_TERM_VBR_CORRECTION static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { YV12_BUFFER_CONFIG temp = *a; @@ -225,26 +223,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->duration -= frame->duration; } -static void avg_stats(FIRSTPASS_STATS *section) { - if (section->count < 1.0) - return; - - section->intra_error /= section->count; - section->coded_error /= section->count; - section->sr_coded_error /= section->count; - section->pcnt_inter /= section->count; - section->pcnt_second_ref /= section->count; - section->pcnt_neutral /= section->count; - section->pcnt_motion /= section->count; 
- section->MVr /= section->count; - section->mvr_abs /= section->count; - section->MVc /= section->count; - section->mvc_abs /= section->count; - section->MVrv /= section->count; - section->MVcv /= section->count; - section->mv_in_out_count /= section->count; - section->duration /= section->count; -} // Calculate a modified Error used in distributing bits between easier and // harder frames. @@ -278,7 +256,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { int i; for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { output_stats(&cpi->svc.layer_context[i].twopass.total_stats, @@ -526,7 +504,7 @@ void vp9_first_pass(VP9_COMP *cpi) { set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { MV_REFERENCE_FRAME ref_frame = LAST_FRAME; const YV12_BUFFER_CONFIG *scaled_ref_buf = NULL; twopass = &cpi->svc.layer_context[cpi->svc.spatial_layer_id].twopass; @@ -613,6 +591,9 @@ void vp9_first_pass(VP9_COMP *cpi) { const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif vp9_clear_system_state(); @@ -676,7 +657,8 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): store some related block statistics here + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; } #endif @@ -727,8 +709,7 @@ void vp9_first_pass(VP9_COMP *cpi) { &unscaled_last_source_buf_2d); #endif // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || - (cpi->use_svc && cpi->svc.number_temporal_layers == 1)) { + if (raw_motion_error > 25 || is_spatial_svc(cpi)) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv, @@ -805,6 +786,20 @@ void vp9_first_pass(VP9_COMP *cpi) { // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + if (motion_error <= this_error) { // Keep a count of cases where the inter and intra were very close // and very low.
This helps with scene cut detection for example in @@ -835,13 +830,50 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): save some related block statistics here + // inter prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_SMALL_MASK; + } } #endif if (mv.as_int) { ++mvcount; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; + } + } +#endif + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) ++new_mv_count; @@ -968,7 +1000,7 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_extend_frame_borders(new_yv12); - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { vp9_update_reference_frames(cpi); } else { // Swap frame pointers so last frame refers to the frame we just compressed. @@ -1035,13 +1067,11 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, const double err_per_mb = section_err / num_mbs; const double speed_term = 1.0 + 0.04 * oxcf->speed; const int target_norm_bits_per_mb = ((uint64_t)section_target_bandwidth << - BPER_MB_NORMBITS) / num_mbs; + BPER_MB_NORMBITS) / num_mbs; int q; int is_svc_upper_layer = 0; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && - cpi->svc.spatial_layer_id > 0) { + if (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0) is_svc_upper_layer = 1; - } // Try and pick a max Q that will be high enough to encode the // content at the given rate. @@ -1157,6 +1187,19 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm, return MIN(second_ref_decay, next_frame->pcnt_inter); } +// This function gives an estimate of the proportion of the frame that is +// effectively static (zero motion), used when monitoring for static sections. +static double get_zero_motion_factor(const VP9_COMMON *cm, + const FIRSTPASS_STATS *frame) { + const double sr_ratio = frame->coded_error / + DOUBLE_DIVIDE_CHECK(frame->sr_coded_error); + const double zero_motion_pct = frame->pcnt_inter - + frame->pcnt_motion; + + return MIN(sr_ratio, zero_motion_pct); +} + + // Function to test for a condition where a complex transition is followed // by a static section. For example in slide shows where there is a fade // between slides. This is to help with more optimal kf and gf positioning. @@ -1681,11 +1724,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { decay_accumulator = decay_accumulator * loop_decay_rate; // Monitor for static sections.
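To make the new get_zero_motion_factor() concrete, here is a worked call with plausible (purely illustrative) first-pass stats:

FIRSTPASS_STATS s = { 0 };
s.pcnt_inter = 0.95;        // 95% of MBs coded inter
s.pcnt_motion = 0.05;       // only 5% carry a non-zero motion vector
s.coded_error = 1000.0;
s.sr_coded_error = 1250.0;  // second-reference coded error
// zero_motion_pct = 0.95 - 0.05 = 0.90; sr_ratio = 1000 / 1250 = 0.80
// get_zero_motion_factor(cm, &s) == MIN(0.80, 0.90) == 0.80
// The accumulator below stays high only while BOTH signals say "static".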
- if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = next_frame.pcnt_inter - - next_frame.pcnt_motion; - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2056,11 +2097,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { break; // Monitor for static sections. - if ((next_frame.pcnt_inter - next_frame.pcnt_motion) < - zero_motion_accumulator) { - zero_motion_accumulator = (next_frame.pcnt_inter - - next_frame.pcnt_motion); - } + zero_motion_accumulator = + MIN(zero_motion_accumulator, + get_zero_motion_factor(&cpi->common, &next_frame)); // For the first few frames collect data to decide kf boost. if (i <= (rc->max_gf_interval * 2)) { @@ -2175,9 +2214,11 @@ void configure_buffer_updates(VP9_COMP *cpi) { break; default: assert(0); + break; } - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { - cpi->refresh_golden_frame = 0; + if (is_spatial_svc(cpi)) { + if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) + cpi->refresh_golden_frame = 0; if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; } @@ -2194,9 +2235,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { int target_rate; LAYER_CONTEXT *lc = NULL; - const int is_spatial_svc = (cpi->use_svc && - cpi->svc.number_temporal_layers == 1); - if (is_spatial_svc) { + + if (is_spatial_svc(cpi)) { lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; frames_left = (int)(twopass->total_stats.count - lc->current_video_frame_in_layer); @@ -2216,15 +2256,15 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = twopass->gf_group.bit_allocation[twopass->gf_group.index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; -#ifdef LONG_TERM_VBR_CORRECTION + // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == VPX_VBR) vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); -#endif + vp9_rc_set_frame_target(cpi, target_rate); cm->frame_type = INTER_FRAME; - if (is_spatial_svc) { + if (is_spatial_svc(cpi)) { if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; } else { @@ -2240,7 +2280,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_clear_system_state(); - if (is_spatial_svc && twopass->kf_intra_err_min == 0) { + if (is_spatial_svc(cpi) && twopass->kf_intra_err_min == 0) { twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; } @@ -2248,7 +2288,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; } else if (cm->current_video_frame == 0 || - (is_spatial_svc && lc->current_video_frame_in_layer == 0)) { + (is_spatial_svc(cpi) && + lc->current_video_frame_in_layer == 0)) { // Special case code for first frame. 
const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -2274,9 +2315,12 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; } - if (is_spatial_svc) { + if (is_spatial_svc(cpi)) { if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = (cm->frame_type == KEY_FRAME); + if (lc->is_key_frame) + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); } else { cm->frame_type = INTER_FRAME; lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; @@ -2302,7 +2346,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { } rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (!is_spatial_svc) + if (!is_spatial_svc(cpi)) cpi->refresh_golden_frame = 1; } @@ -2315,11 +2359,11 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; -#ifdef LONG_TERM_VBR_CORRECTION + // Correction to rate target based on prior over or under shoot. if (cpi->oxcf.rc_mode == VPX_VBR) vbr_rate_correction(&target_rate, rc->vbr_bits_off_target); -#endif + vp9_rc_set_frame_target(cpi, target_rate); // Update the total stats remaining structure. @@ -2329,45 +2373,19 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { void vp9_twopass_postencode_update(VP9_COMP *cpi) { TWO_PASS *const twopass = &cpi->twopass; RATE_CONTROL *const rc = &cpi->rc; -#ifdef LONG_TERM_VBR_CORRECTION - // In this experimental mode, the VBR correction is done exclusively through - // rc->vbr_bits_off_target. Based on the sign of this value, a limited % - // adjustment is made to the target rate of subsequent frames, to try and - // push it back towards 0. This mode is less likely to suffer from - // extreme behaviour at the end of a clip or group of frames. + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. const int bits_used = rc->base_frame_target; rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; -#else - // In this mode, VBR correction is acheived by altering bits_left, - // kf_group_bits & gf_group_bits to reflect any deviation from the target - // rate in this frame. This alters the allocation of bits to the - // remaning frames in the group / clip. - // - // This method can give rise to unstable behaviour near the end of a clip - // or kf/gf group of frames where any accumulated error is corrected over an - // ever decreasing number of frames. Hence we change the balance of target - // vs. actual bitrate gradually as we progress towards the end of the - // sequence in order to mitigate this effect. - const double progress = - (double)(twopass->stats_in - twopass->stats_in_start) / - (twopass->stats_in_end - twopass->stats_in_start); - const int bits_used = (int)(progress * rc->this_frame_target + - (1.0 - progress) * rc->projected_frame_size); -#endif twopass->bits_left = MAX(twopass->bits_left - bits_used, 0); -#ifdef LONG_TERM_VBR_CORRECTION if (cpi->common.frame_type != KEY_FRAME && !vp9_is_upper_layer_key_frame(cpi)) { -#else - if (cpi->common.frame_type == KEY_FRAME || - vp9_is_upper_layer_key_frame(cpi)) { - // For key frames kf_group_bits already had the target bits subtracted out. - // So now update to the correct value based on the actual bits used. 
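The surviving VBR path is a simple feedback loop on rc->vbr_bits_off_target: the post-encode accumulation shown above, plus a bounded pre-encode correction applied through vbr_rate_correction(). A sketch of the idea (the /16 bleed rate is illustrative only, not the actual limit used):

// after encoding: accumulate signed error vs. the per-frame budget
rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
// before the next frame: bleed a limited share of that error back into the
// target, so correction stays gradual even at the end of a clip
target_rate += (int)(rc->vbr_bits_off_target / 16);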
- twopass->kf_group_bits += rc->this_frame_target - bits_used; - } else { -#endif twopass->kf_group_bits -= bits_used; } twopass->kf_group_bits = MAX(twopass->kf_group_bits, 0); diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 33a795f26..bf8c9fd96 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -19,6 +19,20 @@ extern "C" { #endif #if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 + +#define FPMB_MOTION_ZERO_MASK 0x02 +#define FPMB_MOTION_LEFT_MASK 0x04 +#define FPMB_MOTION_RIGHT_MASK 0x08 +#define FPMB_MOTION_UP_MASK 0x10 +#define FPMB_MOTION_DOWN_MASK 0x20 + +#define FPMB_ERROR_SMALL_MASK 0x40 +#define FPMB_ERROR_LARGE_MASK 0x80 +#define FPMB_ERROR_SMALL_TH 2000 +#define FPMB_ERROR_LARGE_TH 48000 + typedef struct { uint8_t *mb_stats_start; uint8_t *mb_stats_end; diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h index bbe509e54..b8234412a 100644 --- a/vp9/encoder/vp9_lookahead.h +++ b/vp9/encoder/vp9_lookahead.h @@ -14,7 +14,7 @@ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #endif @@ -31,7 +31,7 @@ struct lookahead_entry { int64_t ts_end; unsigned int flags; -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC vpx_svc_parameters_t svc_params[VPX_SS_MAX_LAYERS]; #endif }; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index b1eea18b4..a6bcd533b 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -24,7 +24,7 @@ #include "vp9/encoder/vp9_quantize.h" static int get_max_filter_level(const VP9_COMP *cpi) { - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { @@ -89,7 +89,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if ((cpi->pass == 2) && (cpi->twopass.section_intra_rating < 20)) + if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) bias = (bias * cpi->twopass.section_intra_rating) / 20; // yx, bias less for large block size @@ -174,6 +174,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 " "or VPX_BITS_12"); + return; } #else filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 74e0d2b58..f4d156587 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -212,11 +212,11 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, *sse_y = sse; if (sse < dc_quant * dc_quant >> 6) - x->skip_txfm = 1; + x->skip_txfm[0] = 1; else if (var < ac_quant * ac_quant >> 6) - x->skip_txfm = 2; + x->skip_txfm[0] = 2; else - x->skip_txfm = 0; + x->skip_txfm[0] = 0; if (cpi->common.tx_mode == TX_MODE_SELECT) { if (sse > (var << 2)) @@ -431,7 +431,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, INTERP_FILTER filter_ref = cm->interp_filter; int bsl = mi_width_log2(bsize); const int pred_filter_search = cm->interp_filter == SWITCHABLE ? 
- (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0; + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; int const_motion[MAX_REF_FRAMES] = { 0 }; int bh = num_4x4_blocks_high_lookup[bsize] << 2; int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -449,6 +450,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, PRED_BUFFER *this_mode_pred = NULL; int i; + // CTX is used by the temporal denoiser which is currently being developed. + // TODO(jbb): when the temporal denoiser is finished and in the default build + // remove the following line. + (void) ctx; if (cpi->sf.reuse_inter_pred_sby) { for (i = 0; i < 3; i++) { #if CONFIG_VP9_HIGH @@ -500,7 +505,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->coding_use_prev_mi) + if (!cm->error_resilient_mode) vp9_find_mv_refs(cm, xd, tile, xd->mi[0], ref_frame, candidates, mi_row, mi_col); else @@ -600,7 +605,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (cost < best_cost) { best_filter = filter; best_cost = cost; - skip_txfm = x->skip_txfm; + skip_txfm = x->skip_txfm[0]; if (cpi->sf.reuse_inter_pred_sby) { if (this_mode_pred != current_pred) { @@ -626,7 +631,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, dist = pf_dist[mbmi->interp_filter]; var_y = pf_var[mbmi->interp_filter]; sse_y = pf_sse[mbmi->interp_filter]; - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; } else { mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); @@ -649,7 +654,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } -#if CONFIG_DENOISING +#if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { vp9_denoiser_update_frame_stats(&cpi->denoiser, mbmi, sse_y, this_mode, ctx); @@ -664,7 +669,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, best_pred_filter = mbmi->interp_filter; best_tx_size = mbmi->tx_size; best_ref_frame = ref_frame; - skip_txfm = x->skip_txfm; + skip_txfm = x->skip_txfm[0]; if (cpi->sf.reuse_inter_pred_sby) { if (best_pred != NULL) @@ -715,7 +720,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->ref_frame[0] = best_ref_frame; mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int; - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; // Perform intra prediction search, if the best SAD is above a certain // threshold.
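The rewritten pred_filter_search condition earlier in this hunk implements a chessboard schedule: the full filter search runs on alternating block diagonals, and the new frame-index argument flips the pattern every frame, so all positions are covered over any two consecutive frames. A self-contained sketch of the pattern:

static INLINE int chessboard_on(int mi_row, int mi_col, int bsl,
                                int frame_index) {
  // (mi_row + mi_col) >> bsl alternates along block diagonals; adding the
  // frame parity inverts the pattern on every other frame.
  return (((mi_row + mi_col) >> bsl) + (frame_index & 0x1)) & 0x1;
}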
@@ -724,7 +729,6 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int i, j; const int width = num_4x4_blocks_wide_lookup[bsize]; const int height = num_4x4_blocks_high_lookup[bsize]; - const BLOCK_SIZE bsize_tx = txsize_to_bsize[mbmi->tx_size]; int rate2 = 0; int64_t dist2 = 0; @@ -734,28 +738,36 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TX_SIZE tmp_tx_size = MIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tmp_tx_size]; const int step = 1 << tmp_tx_size; - for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { - if (cpi->sf.reuse_inter_pred_sby) { - pd->dst.buf = tmp[0].data; - pd->dst.stride = bw; - } + if (cpi->sf.reuse_inter_pred_sby) { + pd->dst.buf = tmp[0].data; + pd->dst.stride = bw; + } + for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) { + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; for (j = 0; j < height; j += step) { for (i = 0; i < width; i += step) { + p->src.buf = &src_buf_base[4 * (j * src_stride + i)]; + pd->dst.buf = &dst_buf_base[4 * (j * dst_stride + i)]; + // Use source buffer as an approximation for the fully reconstructed + // buffer vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), tmp_tx_size, this_mode, - &p->src.buf[4 * (j * dst_stride + i)], - src_stride, - &pd->dst.buf[4 * (j * dst_stride + i)], - dst_stride, i, j, 0); + p->src.buf, src_stride, + pd->dst.buf, dst_stride, + i, j, 0); model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y); rate2 += rate; dist2 += dist; ++block_idx; } } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; rate = rate2; dist = dist2; @@ -777,7 +789,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, mbmi->uv_mode = this_mode; mbmi->mv[0].as_int = INVALID_MV; } else { - x->skip_txfm = skip_txfm; + x->skip_txfm[0] = skip_txfm; } } } @@ -798,6 +810,7 @@ int vp9_get_intra_cost_penalty(int qindex, int qdelta, return ROUND_POWER_OF_TWO(5 * q, 2); default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else return 20 * q; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 56eac75aa..633abb929 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -25,15 +25,14 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - int eob = -1; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; if (!skip_block) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; @@ -71,15 +70,15 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - int eob = -1; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int 
abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; if (!skip_block) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; @@ -558,6 +557,7 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else return q == 0 ? 64 : (quant < 148 ? 84 : 80); @@ -599,16 +599,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); quants->uv_round[q][i] = (qrounding_factor * quant) >> 7; cm->uv_dequant[q][i] = quant; - -#if CONFIG_ALPHA - // alpha - quant = i == 0 ? vp9_dc_quant(q, cm->a_dc_delta_q, cm->bit_depth) - : vp9_ac_quant(q, cm->a_ac_delta_q, cm->bit_depth); - invert_quant(&quants->a_quant[q][i], &quants->a_quant_shift[q][i], quant); - quants->a_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7); - quants->a_round[q][i] = (qrounding_factor * quant) >> 7; - cm->a_dequant[q][i] = quant; -#endif } for (i = 2; i < 8; i++) { @@ -627,14 +617,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) { quants->uv_zbin[q][i] = quants->uv_zbin[q][1]; quants->uv_round[q][i] = quants->uv_round[q][1]; cm->uv_dequant[q][i] = cm->uv_dequant[q][1]; - -#if CONFIG_ALPHA - quants->a_quant[q][i] = quants->a_quant[q][1]; - quants->a_quant_shift[q][i] = quants->a_quant_shift[q][1]; - quants->a_zbin[q][i] = quants->a_zbin[q][1]; - quants->a_round[q][i] = quants->a_round[q][1]; - cm->a_dequant[q][i] = cm->a_dequant[q][1]; -#endif } } } @@ -657,6 +639,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; + x->plane[0].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[0].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7); xd->plane[0].dequant = cm->y_dequant[qindex]; @@ -668,19 +654,14 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; + x->plane[i].quant_thred[0] = cm->y_dequant[qindex][0] * + cm->y_dequant[qindex][0]; + x->plane[i].quant_thred[1] = cm->y_dequant[qindex][1] * + cm->y_dequant[qindex][1]; x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7); xd->plane[i].dequant = cm->uv_dequant[qindex]; } -#if CONFIG_ALPHA - x->plane[3].quant = quants->a_quant[qindex]; - x->plane[3].quant_shift = quants->a_quant_shift[qindex]; - x->plane[3].zbin = quants->a_zbin[qindex]; - x->plane[3].round = quants->a_round[qindex]; - x->plane[3].zbin_extra = (int16_t)((cm->a_dequant[qindex][1] * zbin) >> 7); - xd->plane[3].dequant = cm->a_dequant[qindex]; -#endif - x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); x->q_index = qindex; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index fa1734729..56f731787 100644 --- 
a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -35,13 +35,6 @@ typedef struct { DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]); - -#if CONFIG_ALPHA - DECLARE_ALIGNED(16, int16_t, a_quant[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_quant_shift[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_zbin[QINDEX_RANGE][8]); - DECLARE_ALIGNED(16, int16_t, a_round[QINDEX_RANGE][8]); -#endif } QUANTS; void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 1bc8ca84a..b100da0cc 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -60,6 +60,7 @@ default: \ assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ " or VPX_BITS_12"); \ + name = NULL; \ } \ } while (0) #else @@ -162,6 +163,7 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; @@ -174,7 +176,7 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000; // q based adjustment to baseline enumerator enumerator += (int)(enumerator * q) >> 12; - return (int)(0.5 + (enumerator * correction_factor / q)); + return (int)(enumerator * correction_factor / q); } static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, @@ -349,7 +351,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { if (cpi->common.frame_type == KEY_FRAME) { return rc->rate_correction_factors[KF_STD]; - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; return rc->rate_correction_factors[rf_lvl]; @@ -368,7 +370,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { if (cpi->common.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; - } else if (cpi->pass == 2) { + } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; rc->rate_correction_factors[rf_lvl] = factor; @@ -1027,7 +1029,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; - if (cpi->pass == 0) { + if (cpi->oxcf.pass == 0) { if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else @@ -1095,7 +1097,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; - if (cpi->pass == 2) { + if (cpi->oxcf.pass == 2) { if (!rc->source_alt_ref_pending && cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD) rc->source_alt_ref_active = 0; @@ -1312,7 +1314,7 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { ? INT_MAX : (int)(rc->starting_buffer_level / 2); } else { int kf_boost = 32; - double framerate = oxcf->framerate; + double framerate = cpi->framerate; if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) { // Use the layer framerate for temporal layers CBR mode. 
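The key-frame boost path just above now reads the live cpi->framerate, and the same substitution appears in vp9_rc_update_framerate() below. A quick sanity check with illustrative numbers showing why this matters for the per-frame budget:

const int64_t target_bandwidth = 1500000;  // bits per second
double framerate = 30.0;                   // tracked in cpi->framerate
int avg_frame_bandwidth = (int)(target_bandwidth / framerate);  // 50000 bits
framerate = 25.0;                          // timestamps later imply ~25 fps
avg_frame_bandwidth = (int)(target_bandwidth / framerate);      // 60000 bits
// The budget follows the timestamp-derived rate, not the init_framerate
// snapshot kept in the config.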
@@ -1340,26 +1342,31 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame = 1; + cpi->ref_frame_flags &= + (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); } - if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_iframe_target_size_one_pass_cbr(cpi); } } else { cm->frame_type = INTER_FRAME; - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { LAYER_CONTEXT *lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; if (cpi->svc.spatial_layer_id == 0) { lc->is_key_frame = 0; } else { lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; + if (lc->is_key_frame) + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); } + cpi->ref_frame_flags &= (~VP9_ALT_FLAG); } - if (cpi->pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR) { target = calc_pframe_target_size_one_pass_cbr(cpi); } } @@ -1466,7 +1473,7 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / oxcf->framerate); + rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 275e7c2fb..ce7ec9637 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -154,7 +154,7 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #endif - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; const int boost_index = MIN(15, (cpi->rc.gfu_boost / 100)); @@ -180,6 +180,7 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { break; default: assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; } #else (void) bit_depth; @@ -289,8 +290,6 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { } } -static const int MAX_XSQ_Q10 = 245727; - static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { // NOTE: The tables below must be of the same size. @@ -380,10 +379,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, *dist = 0; } else { int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; const uint64_t xsq_q10_64 = ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var; - const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ? 
- MAX_XSQ_Q10 : (int)xsq_q10_64; + const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10); model_rd_norm(xsq_q10, &r_q10, &d_q10); *rate = (n * r_q10 + 2) >> 2; *dist = (var * (int64_t)d_q10 + 512) >> 10; @@ -426,6 +425,7 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, break; default: assert(0 && "Invalid transform size."); + break; } } @@ -501,10 +501,6 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, dst[1].buf = src->u_buffer; dst[2].buf = src->v_buffer; dst[1].stride = dst[2].stride = src->uv_stride; -#if CONFIG_ALPHA - dst[3].buf = src->alpha_buffer; - dst[3].stride = src->alpha_stride; -#endif for (i = 0; i < MAX_MB_PLANE; ++i) { setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d3b32a15b..cd8e41e73 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -173,15 +173,27 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, int64_t dist_sum = 0; const int ref = xd->mi[0]->mbmi.ref_frame[0]; unsigned int sse; + const int shift = 8; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); + const unsigned int var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, + &sse); + if (!x->select_tx_size) { + if (sse < p->quant_thred[0] >> shift) + x->skip_txfm[i] = 1; + else if (var < p->quant_thred[1] >> shift) + x->skip_txfm[i] = 2; + else + x->skip_txfm[i] = 0; + } + + x->bsse[i] = sse; if (i == 0) x->pred_sse[ref] = sse; @@ -433,19 +445,51 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize, if (args->skip) return; - if (!is_inter_block(mbmi)) + if (!is_inter_block(mbmi)) { vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip); - else - vp9_xform_quant(x, plane, block, plane_bsize, tx_size); #if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - dist_block(plane, block, tx_size, args, xd->bps); - } else { - dist_block(plane, block, tx_size, args, 8); - } + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + dist_block(plane, block, tx_size, args, xd->bps); + } else { + dist_block(plane, block, tx_size, args, 8); + } #else - dist_block(plane, block, tx_size, args); + dist_block(plane, block, tx_size, args); #endif + } else { + if (x->skip_txfm[plane] == 0) { + // full forward transform and quantization + vp9_xform_quant(x, plane, block, plane_bsize, tx_size); +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + dist_block(plane, block, tx_size, args, xd->bps); + } else { + dist_block(plane, block, tx_size, args, 8); + } +#else + dist_block(plane, block, tx_size, args); +#endif + } else if (x->skip_txfm[plane] == 2) { + // compute DC coefficient + tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); + tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); + vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size); + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + if (!x->plane[plane].eobs[block]) + args->dist = args->sse - ((coeff[0] * coeff[0] - + (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0])) >> 2); + } else { + // skip forward transform + x->plane[plane].eobs[block] = 0; + args->sse = x->bsse[plane] << 4; + args->dist = args->sse; + } + } + + + + rate_block(plane, block, plane_bsize, 
tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse); @@ -517,22 +561,26 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, mbmi->tx_size = MIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(x, rate, distortion, skip, - &sse[mbmi->tx_size], ref_best_rd, 0, bs, + sse, ref_best_rd, 0, bs, mbmi->tx_size, cpi->sf.use_fast_coef_costing); cpi->tx_stepdown_count[0]++; } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, - int (*r)[2], int *rate, - int64_t *d, int64_t *distortion, - int *s, int *skip, + int *rate, + int64_t *distortion, + int *skip, + int64_t *psse, int64_t tx_cache[TX_MODES], + int64_t ref_best_rd, BLOCK_SIZE bs) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; vp9_prob skip_prob = vp9_get_skip_prob(cm, xd); + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, {INT64_MAX, INT64_MAX}, @@ -549,6 +597,9 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s1 = vp9_cost_bit(skip_prob, 1); for (n = TX_4X4; n <= max_tx_size; n++) { + txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], + &sse[n], ref_best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { for (m = 0; m <= n - (n == max_tx_size); m++) { @@ -579,6 +630,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, *distortion = d[mbmi->tx_size]; *rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->tx_size]; + *psse = sse[mbmi->tx_size]; tx_cache[ONLY_4X4] = rd[TX_4X4][0]; tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; @@ -605,65 +657,39 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *psse, BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const TX_SIZE max_tx_size = max_txsize_lookup[bs]; - TX_SIZE tx_size; - assert(bs == mbmi->sb_type); + assert(bs == xd->mi[0]->mbmi.sb_type); vp9_subtract_plane(x, bs, 0); if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + choose_largest_tx_size(cpi, x, rate, distortion, skip, psse, ref_best_rd, bs); - if (psse) - *psse = sse[mbmi->tx_size]; - return; + } else { + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, psse, + txfm_cache, ref_best_rd, bs); } - - for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) - txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size], - &sse[tx_size], ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, - skip, txfm_cache, bs); - - if (psse) - *psse = sse[mbmi->tx_size]; } static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, - int64_t *psse, BLOCK_SIZE bs, + BLOCK_SIZE bs, int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { - int64_t sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; + int64_t sse; - assert(bs == mbmi->sb_type); + assert(bs == xd->mi[0]->mbmi.sb_type); if (cpi->sf.tx_size_search_method != USE_FULL_RD || xd->lossless) { 
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); - choose_largest_tx_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, + choose_largest_tx_size(cpi, x, rate, distortion, skip, &sse, ref_best_rd, bs); } else { - int r[TX_SIZES][2], s[TX_SIZES]; - int64_t d[TX_SIZES]; - TX_SIZE tx_size; - for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size) - txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], - &s[tx_size], &sse[tx_size], - ref_best_rd, 0, bs, tx_size, - cpi->sf.use_fast_coef_costing); - choose_tx_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, - bs); + choose_tx_size_from_rd(cpi, x, rate, distortion, skip, &sse, + txfm_cache, ref_best_rd, bs); } - if (psse) - *psse = sse[mbmi->tx_size]; } @@ -1023,7 +1049,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, mic->mbmi.mode = mode; intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, - &s, NULL, bsize, local_tx_cache, best_rd); + &s, bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -2275,6 +2301,100 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, } } +static void rd_encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int *rate2, + int64_t *distortion, int64_t *distortion_uv, + int *disable_skip) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); + unsigned int var, sse; + // Skipping threshold for ac. + unsigned int thresh_ac; + // Skipping threshold for dc + unsigned int thresh_dc; + + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &sse); + + if (x->encode_breakout > 0) { + // Set a maximum for threshold to avoid big PSNR loss in low bitrate + // case. Use extreme low threshold for static frames to limit skipping. + const unsigned int max_thresh = (cpi->allow_encode_breakout == + ENCODE_BREAKOUT_LIMITED) ? 128 : 36000; + // The encode_breakout input + const unsigned int min_thresh = + MIN(((unsigned int)x->encode_breakout << 4), max_thresh); + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + const int shift = 2 * xd->bps - 16; + if (shift > 0) + thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift); + } +#endif + thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); + + // Adjust threshold according to partition size. 
+ thresh_ac >>= 8 - (b_width_log2(bsize) + + b_height_log2(bsize)); + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); +#if CONFIG_VP9_HIGH + if (xd->cur_buf->flags & YV12_FLAG_HIGH) { + const int shift = 2 * xd->bps - 16; + if (shift > 0) + thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); + } +#endif + } else { + thresh_ac = 0; + thresh_dc = 0; + } + + // Y skipping condition checking + if (sse < thresh_ac || sse == 0) { + // dc skipping checking + if ((sse - var) < thresh_dc || sse == var) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((sse_u * 4 < thresh_ac || sse_u == 0) && + (sse_u - var_u < thresh_dc || sse_u == var_u)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((sse_v * 4 < thresh_ac || sse_v == 0) && + (sse_v - var_v < thresh_dc || sse_v == var_v)) { + x->skip = 1; + + // The cost of skip bit needs to be added. + *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); + + // Scaling factor for SSE from spatial domain to frequency domain + // is 16. Adjust distortion accordingly. + *distortion_uv = (sse_u + sse_v) << 4; + *distortion = (sse << 4) + *distortion_uv; + + *disable_skip = 1; + } + } + } + } +} + static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t txfm_cache[], @@ -2282,8 +2402,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int *skippable, int *rate_y, int64_t *distortion_y, int *rate_uv, int64_t *distortion_uv, - int *mode_excluded, int *disable_skip, - INTERP_FILTER *best_filter, + int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], @@ -2294,7 +2413,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const int is_comp_pred = has_second_ref(mbmi); - const int num_refs = is_comp_pred ? 2 : 1; const int this_mode = mbmi->mode; int_mv *frame_mv = mode_mv[this_mode]; int i; @@ -2316,6 +2434,15 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; + INTERP_FILTER best_filter = SWITCHABLE; + int skip_txfm[MAX_MB_PLANE] = {0}; + int64_t bsse[MAX_MB_PLANE] = {0}; + + int bsl = mi_width_log2_lookup[bsize]; + int pred_filter_search = cpi->sf.cb_pred_filter_search ? 
+ (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; + #if CONFIG_VP9_HIGH if (xd->cur_buf->flags & YV12_FLAG_HIGH) { tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16); @@ -2324,6 +2451,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } #endif + if (pred_filter_search) { + INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; + if (xd->up_available) + af = xd->mi[-xd->mi_stride]->mbmi.interp_filter; + if (xd->left_available) + lf = xd->mi[-1]->mbmi.interp_filter; + + if ((this_mode != NEWMV) || (af == lf)) + best_filter = af; + } + if (is_comp_pred) { if (frame_mv[refs[0]].as_int == INVALID_MV || frame_mv[refs[1]].as_int == INVALID_MV) @@ -2362,7 +2500,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - for (i = 0; i < num_refs; ++i) { + for (i = 0; i < is_comp_pred + 1; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image if (this_mode != NEWMV) @@ -2389,10 +2527,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, * if the first is known */ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]); - if (!(*mode_excluded)) - *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE - : cm->reference_mode == COMPOUND_REFERENCE; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv); @@ -2406,10 +2540,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, rd_opt->filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { - *best_filter = EIGHTTAP; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { - *best_filter = EIGHTTAP; - } else { + best_filter = EIGHTTAP; + } else if (best_filter == SWITCHABLE) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2471,9 +2604,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (newbest) { best_rd = rd; - *best_filter = mbmi->interp_filter; + best_filter = mbmi->interp_filter; if (cm->interp_filter == SWITCHABLE && i && !intpel_mv) best_needs_copy = !best_needs_copy; + vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(bsse, x->bsse, sizeof(bsse)); } if ((cm->interp_filter == SWITCHABLE && newbest) || @@ -2487,7 +2622,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } // Set the appropriate filter mbmi->interp_filter = cm->interp_filter != SWITCHABLE ? - cm->interp_filter : *best_filter; + cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0; if (pred_exists) { @@ -2521,97 +2656,14 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, *rate2 += vp9_get_switchable_rate(cpi); if (!is_comp_pred) { - if (cpi->allow_encode_breakout) { - const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); - unsigned int var, sse; - // Skipping threshold for ac. - unsigned int thresh_ac; - // Skipping threshold for dc - unsigned int thresh_dc; - - var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride, &sse); - - if (x->encode_breakout > 0) { - // Set a maximum for threshold to avoid big PSNR loss in low bitrate - // case. Use extreme low threshold for static frames to limit skipping. - const unsigned int max_thresh = (cpi->allow_encode_breakout == - ENCODE_BREAKOUT_LIMITED) ? 
128 : 36000; - // The encode_breakout input - const unsigned int min_thresh = - MIN(((unsigned int)x->encode_breakout << 4), max_thresh); - - // Calculate threshold according to dequant value. - thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; -#if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - const int shift = 2 * xd->bps - 16; - if (shift > 0) - thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift); - } -#endif - thresh_ac = clamp(thresh_ac, min_thresh, max_thresh); - - // Adjust threshold according to partition size. - thresh_ac >>= 8 - (b_width_log2(bsize) + - b_height_log2(bsize)); - thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); -#if CONFIG_VP9_HIGH - if (xd->cur_buf->flags & YV12_FLAG_HIGH) { - const int shift = 2 * xd->bps - 16; - if (shift > 0) - thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift); - } -#endif - } else { - thresh_ac = 0; - thresh_dc = 0; - } - - // Y skipping condition checking - if (sse < thresh_ac || sse == 0) { - // dc skipping checking - if ((sse - var) < thresh_dc || sse == var) { - unsigned int sse_u, sse_v; - unsigned int var_u, var_v; - - var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse_u); - - // U skipping condition checking - if ((sse_u * 4 < thresh_ac || sse_u == 0) && - (sse_u - var_u < thresh_dc || sse_u == var_u)) { - var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, - x->plane[2].src.stride, - xd->plane[2].dst.buf, - xd->plane[2].dst.stride, &sse_v); - - // V skipping condition checking - if ((sse_v * 4 < thresh_ac || sse_v == 0) && - (sse_v - var_v < thresh_dc || sse_v == var_v)) { - x->skip = 1; - - // The cost of skip bit needs to be added. - *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); - - // Scaling factor for SSE from spatial domain to frequency domain - // is 16. Adjust distortion accordingly. 
- *distortion_uv = (sse_u + sse_v) << 4; - *distortion = (sse << 4) + *distortion_uv; - - *disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); - } - } - } - } - } + if (cpi->allow_encode_breakout) + rd_encode_breakout_test(cpi, x, bsize, rate2, distortion, distortion_uv, + disable_skip); } + vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); + vpx_memcpy(x->bsse, bsse, sizeof(bsse)); + if (!x->skip) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; @@ -2776,7 +2828,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_inter_rd = INT64_MAX; PREDICTION_MODE best_intra_mode = DC_PRED; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; - INTERP_FILTER tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; @@ -2818,9 +2869,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile, - ref_frame, bsize, mi_row, mi_col, - frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); + setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col, + frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -2926,6 +2976,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } if (mode_skip_mask & (1 << mode_index)) @@ -2942,6 +2993,55 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; + if (cpi->sf.motion_field_mode_search) { + const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize], + tile->mi_col_end - mi_col); + const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize], + tile->mi_row_end - mi_row); + const int bsl = mi_width_log2(bsize); + int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + MB_MODE_INFO *ref_mbmi; + int const_motion = 1; + int skip_ref_frame = !cb_partition_search_ctrl; + MV_REFERENCE_FRAME rf = NONE; + int_mv ref_mv; + ref_mv.as_int = INVALID_MV; + + if ((mi_row - 1) >= tile->mi_row_start) { + ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0]; + rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0]; + for (i = 0; i < mi_width; ++i) { + ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } + + if ((mi_col - 1) >= tile->mi_col_start) { + if (ref_mv.as_int == INVALID_MV) + ref_mv = xd->mi[-1]->mbmi.mv[0]; + if (rf == NONE) + rf = xd->mi[-1]->mbmi.ref_frame[0]; + for (i = 0; i < mi_height; ++i) { + ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi; + const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) && + (ref_frame == ref_mbmi->ref_frame[0]); + skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]); + } + } + + if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV) + if (rf > INTRA_FRAME) + if (ref_frame != rf) + continue; + + if (const_motion) + if (this_mode == NEARMV || this_mode == ZEROMV) + continue; + } + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && @@ -3012,7 +3112,7 @@ 
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; - intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, + intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, bsize, tx_cache, best_rd); if (rate_y == INT_MAX) @@ -3041,8 +3141,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, &rate_uv, &distortion_uv, - &mode_excluded, &disable_skip, - &tmp_best_filter, frame_mv, + &disable_skip, frame_mv, mi_row, mi_col, single_newmv, &total_sse, best_rd); if (this_rd == INT64_MAX) @@ -3322,7 +3421,6 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, RD_OPT *const rd_opt = &cpi->rd; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; - const struct segmentation *const seg = &cm->seg; unsigned char segment_id = mbmi->segment_id; const int comp_pred = 0; int i; @@ -3348,7 +3446,7 @@ int64_t vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = INT_MAX; - assert(vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)); + assert(vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); mbmi->mode = ZEROMV; mbmi->uv_mode = DC_PRED; @@ -3532,6 +3630,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, case NONE: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); + break; } } if (mode_skip_mask & (1 << ref_index)) @@ -3576,13 +3675,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf)) continue; - if (comp_pred) { - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == SINGLE_REFERENCE; - } else if (ref_frame != INTRA_FRAME) { - mode_excluded = mode_excluded ? mode_excluded - : cm->reference_mode == COMPOUND_REFERENCE; - } + if (comp_pred) + mode_excluded = cm->reference_mode == SINGLE_REFERENCE; + else if (ref_frame != INTRA_FRAME) + mode_excluded = cm->reference_mode == COMPOUND_REFERENCE; // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. 
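/*
 * [Editorial sketch -- not part of the patch.] The new cb_pred_filter_search
 * and motion_field_mode_search paths above both gate work with the same
 * chessboard test: (((mi_row + mi_col) >> bsl) + chessboard_index) & 0x1.
 * The standalone program below, which assumes get_chessboard_index() simply
 * alternates with the frame counter (a simplification; the helper's real
 * definition is not shown in this patch), illustrates the resulting pattern:
 * adjacent blocks toggle between "run the full search" and "reuse/skip", and
 * the phase flips every frame so no block position is starved of a full
 * search for long.
 */
#include <stdio.h>

static int chessboard_index(int frame) {  /* hypothetical stand-in */
  return frame & 0x1;
}

/* Returns 1 when this block should run the expensive search this frame. */
static int full_search(int mi_row, int mi_col, int bsl, int frame) {
  return (((mi_row + mi_col) >> bsl) + chessboard_index(frame)) & 0x1;
}

int main(void) {
  int frame, r, c;
  for (frame = 0; frame < 2; ++frame) {
    printf("frame %d (bsl = 0):\n", frame);
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c)
        printf("%d ", full_search(r, c, 0, frame));
      printf("\n");
    }
  }
  return 0;
}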
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 897ae0129..d5676c3d1 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -110,14 +110,12 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) { return cost; } -static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi, +static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int bw, int bh, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; int segment_id; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) @@ -151,14 +149,13 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile, } } -static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, - MODE_INFO **mi, +static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd, + const TileInfo *tile, MODE_INFO **mi, int *no_pred_segcounts, int (*temporal_predictor_count)[2], int *t_unpred_seg_counts, int mi_row, int mi_col, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; const int mis = cm->mi_stride; int bw, bh; const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2; @@ -170,18 +167,18 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type]; if (bw == bs && bh == bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, bs, mi_row, mi_col); } else if (bw == bs && bh < bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row, mi_col); - count_segs(cpi, tile, mi + hbs * mis, no_pred_segcounts, + count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, bs, hbs, mi_row + hbs, mi_col); } else if (bw < bs && bh == bs) { - count_segs(cpi, tile, mi, no_pred_segcounts, temporal_predictor_count, + count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col); - count_segs(cpi, tile, mi + hbs, + count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs); } else { @@ -194,7 +191,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, const int mi_dc = hbs * (n & 1); const int mi_dr = hbs * (n >> 1); - count_segs_sb(cpi, tile, &mi[mi_dr * mis + mi_dc], + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row + mi_dr, mi_col + mi_dc, subsize); @@ -202,8 +199,7 @@ static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile, } } -void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) { struct segmentation *seg = &cm->seg; int no_pred_cost; @@ -237,7 +233,7 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { MODE_INFO **mi = mi_ptr; for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += 8, mi += 8) - count_segs_sb(cpi, &tile, mi, no_pred_segcounts, + count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, mi_row, mi_col, 
BLOCK_64X64); } diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h index 50dd562c8..8c6944ad1 100644 --- a/vp9/encoder/vp9_segmentation.h +++ b/vp9/encoder/vp9_segmentation.h @@ -42,7 +42,7 @@ void vp9_clear_segdata(struct segmentation *seg, void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data, unsigned char abs_delta); -void vp9_choose_segmap_coding_method(VP9_COMP *cpi); +void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd); void vp9_reset_segment_features(struct segmentation *seg); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index e3951d532..99f336f2a 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -88,13 +88,13 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->last_partitioning_redo_frequency = 3; sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->adaptive_pred_interp_filter = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->last_partitioning_redo_frequency = 2; sf->lf_motion_threshold = NO_MOTION_THRESHOLD; } - sf->adaptive_pred_interp_filter = 0; sf->reference_masking = 1; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | @@ -110,18 +110,23 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, if (speed >= 3) { sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; - if (MIN(cm->width, cm->height) >= 720) + if (MIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; - else + } else { + sf->max_intra_bsize = BLOCK_32X32; sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; - + } + sf->adaptive_pred_interp_filter = 0; + sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1; + sf->cb_pred_filter_search = 1; + sf->motion_field_mode_search = frame_is_boosted(cpi) ? 0 : 1; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->last_partitioning_redo_frequency = 3; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; - sf->use_fast_coef_updates = ONE_LOOP_REDUCED; - sf->use_fast_coef_costing = 1; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; } if (speed >= 4) { @@ -134,6 +139,8 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->disable_filter_search_var_thresh = 200; sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL; sf->use_lp32x32fdct = 1; + sf->use_fast_coef_updates = ONE_LOOP_REDUCED; + sf->use_fast_coef_costing = 1; } if (speed >= 5) { @@ -155,7 +162,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, } static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, - int speed) { + int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; const int frames_since_key = cm->frame_type == KEY_FRAME ? 
0 : cpi->rc.frames_since_key; @@ -176,6 +183,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->use_rd_breakout = 1; + sf->adaptive_motion_search = 1; sf->adaptive_pred_interp_filter = 1; sf->mv.auto_mv_step_size = 1; @@ -270,13 +278,19 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, } if (speed >= 6) { + if (content == VP9E_CONTENT_SCREEN) { + int i; + // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used + for (i = 0; i < BLOCK_SIZES; ++i) + sf->inter_mode_mask[i] = INTER_ALL; + } + // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION. sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ? USE_LARGESTALL : USE_TX_8X8; - sf->max_intra_bsize = BLOCK_8X8; // This feature is only enabled when partition search is disabled. sf->reuse_inter_pred_sby = 1; @@ -330,6 +344,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; + sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; + sf->motion_field_mode_search = 0; sf->use_quant_fp = 0; sf->reference_masking = 0; sf->partition_search_type = SEARCH_PARTITION; @@ -385,17 +402,17 @@ void vp9_set_speed_features(VP9_COMP *cpi) { set_good_speed_feature(cpi, cm, sf, oxcf->speed); break; case REALTIME: - set_rt_speed_feature(cpi, sf, oxcf->speed); + set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content); break; } // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. - if (cpi->pass == 1) + if (oxcf->pass == 1) sf->optimize_coefficients = 0; // No recode for 1 pass. - if (cpi->pass == 0) { + if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = 0; } @@ -404,7 +421,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree; } - cpi->mb.optimize = sf->optimize_coefficients == 1 && cpi->pass != 1; + cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; if (sf->disable_split_mask == DISABLE_ALL_SPLIT) sf->adaptive_pred_interp_filter = 0; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index bdbbe5888..243139d7b 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -283,6 +283,13 @@ typedef struct SPEED_FEATURES { // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected. 
int adaptive_pred_interp_filter; + // Chessboard pattern prediction filter type search + int cb_pred_filter_search; + + int cb_partition_search; + + int motion_field_mode_search; + // Fast quantization process path int use_quant_fp; diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c index 9796d6476..2f1c72992 100644 --- a/vp9/encoder/vp9_subexp.c +++ b/vp9/encoder/vp9_subexp.c @@ -16,7 +16,24 @@ #define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd))) -static int update_bits[255]; +static const int update_bits[255] = { + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 0, +}; static int recenter_nonneg(int v, int m) { if (v > (m << 1)) @@ -61,18 +78,6 @@ static int remap_prob(int v, int m) { return i; } -static int count_term_subexp(int word) { - if (word < 16) - return 5; - if (word < 32) - return 6; - if (word < 64) - return 8; - if (word < 129) - return 10; - return 11; -} - static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) { int delp = remap_prob(newp, oldp); return update_bits[delp] * 256; @@ -111,12 +116,6 @@ void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp) { encode_term_subexp(w, delp); } -void vp9_compute_update_table() { - int i; - for (i = 0; i < 254; i++) - update_bits[i] = count_term_subexp(i); -} - int vp9_prob_diff_update_savings_search(const unsigned int *ct, vp9_prob oldp, vp9_prob *bestp, vp9_prob upd) { diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h index 8e9c0c62a..8e02a1d0d 100644 --- a/vp9/encoder/vp9_subexp.h +++ b/vp9/encoder/vp9_subexp.h @@ -16,9 +16,6 @@ extern "C" { #endif -void vp9_compute_update_table(); - - void vp9_write_prob_diff_update(vp9_writer *w, vp9_prob newp, vp9_prob oldp); diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 1eb450928..52f6cda53 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -35,6 +35,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; int i; lc->current_video_frame_in_layer = 0; + lc->layer_size = 0; lrc->ni_av_qi = oxcf->worst_allowed_q; lrc->total_actual_bits = 0; lrc->total_target_vs_actual = 0; @@ -48,7 +49,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; } - lc->layer_size = 0; if (svc->number_temporal_layers > 1) { lc->target_bandwidth = oxcf->ts_target_bitrate[layer]; @@ -66,12 +66,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->alt_ref_idx = alt_ref_idx++; else lc->alt_ref_idx = -1; + lc->gold_ref_idx = 
-1; } lrc->buffer_level = vp9_rescale((int)(oxcf->starting_buffer_level_ms), lc->target_bandwidth, 1000); lrc->bits_off_target = lrc->buffer_level; } + + // Still have extra buffer for base layer golden frame + if (svc->number_spatial_layers > 1 && alt_ref_idx < REF_FRAMES) + svc->layer_context[0].gold_ref_idx = alt_ref_idx; } // Update the layer context from a change_config() call. @@ -111,9 +116,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lrc->buffer_level = MIN(lrc->buffer_level, lrc->maximum_buffer_size); // Update framerate-related quantities. if (svc->number_temporal_layers > 1) { - lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; } else { - lc->framerate = oxcf->framerate; + lc->framerate = cpi->framerate; } lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; @@ -136,7 +141,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { RATE_CONTROL *const lrc = &lc->rc; const int layer = svc->temporal_layer_id; - lc->framerate = oxcf->framerate / oxcf->ts_rate_decimator[layer]; + lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer]; lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). @@ -144,7 +149,7 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { lc->avg_frame_size = lrc->avg_frame_bandwidth; } else { const double prev_layer_framerate = - oxcf->framerate / oxcf->ts_rate_decimator[layer - 1]; + cpi->framerate / oxcf->ts_rate_decimator[layer - 1]; const int prev_layer_target_bandwidth = oxcf->ts_target_bitrate[layer - 1]; lc->avg_frame_size = (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / @@ -217,12 +222,12 @@ void vp9_inc_frame_in_layer(SVC *svc) { } int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return cpi->use_svc && - cpi->svc.number_temporal_layers == 1 && + return is_spatial_svc(cpi) && cpi->svc.spatial_layer_id > 0 && cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame; } +#if CONFIG_SPATIAL_SVC int vp9_svc_lookahead_push(const VP9_COMP *const cpi, struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, unsigned int flags) { @@ -265,21 +270,25 @@ static int copy_svc_params(VP9_COMP *const cpi, struct lookahead_entry *buf) { layer_param = &buf->svc_params[layer_id]; cpi->svc.spatial_layer_id = layer_param->spatial_layer; cpi->svc.temporal_layer_id = layer_param->temporal_layer; + cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; + + lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; cpi->lst_fb_idx = cpi->svc.spatial_layer_id; if (cpi->svc.spatial_layer_id < 1) - cpi->gld_fb_idx = cpi->lst_fb_idx; + cpi->gld_fb_idx = lc->gold_ref_idx >= 0 ? 
+ lc->gold_ref_idx : cpi->lst_fb_idx; else cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) + if (cpi->svc.spatial_layer_id >= 2) { cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - else + } else { cpi->alt_fb_idx = cpi->lst_fb_idx; + cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); + } } else { if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) { cpi->alt_fb_idx = lc->alt_ref_idx; @@ -352,3 +361,4 @@ struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, return buf; } +#endif diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 7b533e467..801449b6f 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -31,6 +31,7 @@ typedef struct { vpx_svc_parameters_t svc_params_received; struct lookahead_entry *alt_ref_source; int alt_ref_idx; + int gold_ref_idx; int has_alt_frame; size_t layer_size; } LAYER_CONTEXT; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 7607307a4..cfe5a39d6 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -634,7 +634,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi, } // Adjustments for second level arf in multi arf case. - if (cpi->pass == 2 && cpi->multi_arf_allowed) { + if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { cpi->active_arnr_strength >>= 1; @@ -671,7 +671,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } // Setup scaling factors. Scaling on each of the arnr frames is not supported - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { + if (is_spatial_svc(cpi)) { // In spatial svc the scaling factors might be less then 1/2. So we will use // non-normative scaling. int frame_used = 0; diff --git a/vp9/encoder/vp9_write_bit_buffer.c b/vp9/encoder/vp9_write_bit_buffer.c index 962d0ca56..6d55e84e8 100644 --- a/vp9/encoder/vp9_write_bit_buffer.c +++ b/vp9/encoder/vp9_write_bit_buffer.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> #include "vp9/encoder/vp9_write_bit_buffer.h" -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb) { +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb) { return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); } diff --git a/vp9/encoder/vp9_write_bit_buffer.h b/vp9/encoder/vp9_write_bit_buffer.h index 073608d7f..59f9bbe30 100644 --- a/vp9/encoder/vp9_write_bit_buffer.h +++ b/vp9/encoder/vp9_write_bit_buffer.h @@ -11,8 +11,6 @@ #ifndef VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ #define VP9_ENCODER_VP9_WRITE_BIT_BUFFER_H_ -#include <limits.h> - #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -24,7 +22,7 @@ struct vp9_write_bit_buffer { size_t bit_offset; }; -size_t vp9_rb_bytes_written(struct vp9_write_bit_buffer *wb); +size_t vp9_wb_bytes_written(const struct vp9_write_bit_buffer *wb); void vp9_wb_write_bit(struct vp9_write_bit_buffer *wb, int bit); diff --git a/vp9/encoder/vp9_writer.c b/vp9/encoder/vp9_writer.c index 8398fc07a..ff461f218 100644 --- a/vp9/encoder/vp9_writer.c +++ b/vp9/encoder/vp9_writer.c @@ -15,7 +15,6 @@ void vp9_start_encode(vp9_writer *br, uint8_t *source) { br->lowvalue = 0; br->range = 255; - br->value = 0; br->count = -24; br->buffer = source; br->pos = 0; diff --git a/vp9/encoder/vp9_writer.h b/vp9/encoder/vp9_writer.h index 7f4fa1ef2..9d161f95c 100644 --- a/vp9/encoder/vp9_writer.h +++ b/vp9/encoder/vp9_writer.h @@ -22,20 +22,15 @@ extern "C" { typedef struct { unsigned int lowvalue; unsigned int range; - unsigned int value; int count; unsigned int pos; uint8_t *buffer; - - // Variables used to track bit costs without outputing to the bitstream - unsigned int measure_cost; - uint64_t bit_counter; } vp9_writer; void vp9_start_encode(vp9_writer *bc, uint8_t *buffer); void vp9_stop_encode(vp9_writer *bc); -static void vp9_write(vp9_writer *br, int bit, int probability) { +static INLINE void vp9_write(vp9_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; @@ -83,11 +78,11 @@ static void vp9_write(vp9_writer *br, int bit, int probability) { br->range = range; } -static void vp9_write_bit(vp9_writer *w, int bit) { +static INLINE void vp9_write_bit(vp9_writer *w, int bit) { vp9_write(w, bit, 128); // vp9_prob_half } -static void vp9_write_literal(vp9_writer *w, int data, int bits) { +static INLINE void vp9_write_literal(vp9_writer *w, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c index b5269ed03..3a19f5274 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vp9/encoder/x86/vp9_dct_avx2.c @@ -12,2572 +12,6 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - const __m128i kOne = _mm_set1_epi16(1); - __m128i in0, in1, in2, in3; - // Load inputs. - { - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - // x = x << 4 - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - in2 = _mm_slli_epi16(in2, 4); - in3 = _mm_slli_epi16(in3, 4); - // if (i == 0 && input[0]) input[0] += 1; - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - } - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/subtract - const __m128i r0 = _mm_add_epi16(in0, in3); - const __m128i r1 = _mm_add_epi16(in1, in2); - const __m128i r2 = _mm_sub_epi16(in1, in2); - const __m128i r3 = _mm_sub_epi16(in0, in3); - // Transform 1/2: Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - // Combine and transpose - const __m128i res0 = _mm_packs_epi32(w0, w2); - const __m128i res1 = _mm_packs_epi32(w4, w6); - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 - // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 - if (0 == pass) { - // Extract values in the high part for second pass as transform code - // only uses the first four values. 
- in1 = _mm_unpackhi_epi64(in0, in0); - in3 = _mm_unpackhi_epi64(in2, in2); - } else { - // Post-condition output and store it (v + 1) >> 2, taking advantage - // of the fact 1/3 are stored just after 0/2. - __m128i out01 = _mm_add_epi16(in0, kOne); - __m128i out23 = _mm_add_epi16(in2, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); - _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); - } - } -} - -static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in, - int stride) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_store_si128((__m128i *)(output + 0 * 8), out01); - _mm_store_si128((__m128i *)(output + 1 * 8), out23); -} - -static INLINE void transpose_4x4_avx2(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -void fdct4_avx2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0]=_mm_unpacklo_epi16(in[0], in[1]); - u[1]=_mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = 
_mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - transpose_4x4_avx2(in); -} - -void fadst4_avx2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4_avx2(in); -} - -void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[4]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct4x4_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fdct4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case DCT_ADST: - load_buffer_4x4_avx2(input, in, stride); - fdct4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case ADST_ADST: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - default: - assert(0); - break; - } -} - -void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = _mm_add_epi16(in0, in7);
-    const __m128i q1 = _mm_add_epi16(in1, in6);
-    const __m128i q2 = _mm_add_epi16(in2, in5);
-    const __m128i q3 = _mm_add_epi16(in3, in4);
-    const __m128i q4 = _mm_sub_epi16(in3, in4);
-    const __m128i q5 = _mm_sub_epi16(in2, in5);
-    const __m128i q6 = _mm_sub_epi16(in1, in6);
-    const __m128i q7 = _mm_sub_epi16(in0, in7);
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = _mm_add_epi16(q0, q3);
-      const __m128i r1 = _mm_add_epi16(q1, q2);
-      const __m128i r2 = _mm_sub_epi16(q1, q2);
-      const __m128i r3 = _mm_sub_epi16(q0, q3);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res0 = _mm_packs_epi32(w0, w1);
-      res4 = _mm_packs_epi32(w2, w3);
-      res2 = _mm_packs_epi32(w4, w5);
-      res6 = _mm_packs_epi32(w6, w7);
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-      // Add/subtract
-      const __m128i x0 = _mm_add_epi16(q4, r0);
-      const __m128i x1 = _mm_sub_epi16(q4, r0);
-      const __m128i x2 = _mm_sub_epi16(q7, r1);
-      const __m128i x3 = _mm_add_epi16(q7, r1);
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-      // dct_const_round_shift
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-      // Combine
-      res1 = _mm_packs_epi32(w0, w1);
-      res7 = _mm_packs_epi32(w2, w3);
-      res5 = _mm_packs_epi32(w4, w5);
-      res3 = _mm_packs_epi32(w6, w7);
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    // division of two 16 bits signed numbers using shifts
-    // n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
-    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
-    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
-    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
-    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
-    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
-    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
-    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
-  }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
-                                        int stride) {
-  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  in[0] = _mm_slli_epi16(in[0], 2);
-  in[1] = _mm_slli_epi16(in[1], 2);
-  in[2] = _mm_slli_epi16(in[2], 2);
-  in[3] = _mm_slli_epi16(in[3], 2);
-  in[4] = _mm_slli_epi16(in[4], 2);
-  in[5] = _mm_slli_epi16(in[5], 2);
-  in[6] = _mm_slli_epi16(in[6], 2);
-  in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
-  const __m128i kOne = _mm_set1_epi16(1);
-  const int bit_m02 = bit - 2;
-  __m128i sign0 = _mm_srai_epi16(res[0], 15);
-  __m128i sign1 = _mm_srai_epi16(res[1], 15);
-  __m128i sign2 = _mm_srai_epi16(res[2], 15);
-  __m128i sign3 = _mm_srai_epi16(res[3], 15);
-  __m128i sign4 = _mm_srai_epi16(res[4], 15);
-  __m128i sign5 = _mm_srai_epi16(res[5], 15);
-  __m128i sign6 = _mm_srai_epi16(res[6], 15);
-  __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
-  if (bit_m02 >= 0) {
-    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
-    res[0] = _mm_add_epi16(res[0], k_const_rounding);
-    res[1] = _mm_add_epi16(res[1], k_const_rounding);
-    res[2] = _mm_add_epi16(res[2], k_const_rounding);
-    res[3] = _mm_add_epi16(res[3], k_const_rounding);
-    res[4] = _mm_add_epi16(res[4], k_const_rounding);
-    res[5] = _mm_add_epi16(res[5], k_const_rounding);
-    res[6] = _mm_add_epi16(res[6], k_const_rounding);
-    res[7] = _mm_add_epi16(res[7], k_const_rounding);
-  }
-
-  res[0] = _mm_sub_epi16(res[0], sign0);
-  res[1] = _mm_sub_epi16(res[1], sign1);
-  res[2] = _mm_sub_epi16(res[2], sign2);
-  res[3] = _mm_sub_epi16(res[3], sign3);
-  res[4] = _mm_sub_epi16(res[4], sign4);
-  res[5] = _mm_sub_epi16(res[5], sign5);
-  res[6] = _mm_sub_epi16(res[6], sign6);
-  res[7] = _mm_sub_epi16(res[7], sign7);
-
-  res[0] = _mm_srai_epi16(res[0], bit);
-  res[1] = _mm_srai_epi16(res[1], bit);
-  res[2] = _mm_srai_epi16(res[2], bit);
-  res[3] = _mm_srai_epi16(res[3], bit);
-  res[4] = _mm_srai_epi16(res[4], bit);
-  res[5] = _mm_srai_epi16(res[5], bit);
-  res[6] = _mm_srai_epi16(res[6], bit);
-  res[7] = _mm_srai_epi16(res[7], bit);
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res,
-                                         int stride) {
-  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
-  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
-  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
-  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
-  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
-  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
-  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
-  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
-}
-
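A note on the sign trick that both the post-conditioning above and right_shift_8x8_avx2() rely on: _mm_srai_epi16 rounds toward minus infinity, so the code subtracts the sign bit (-1 for negative lanes, 0 otherwise) before shifting, which makes the shift round toward zero like C signed division. A minimal scalar model (my illustration, not part of the removed file):

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of right_shift_8x8_avx2: optional rounding offset,
     * then subtract the sign before the arithmetic shift. */
    static int16_t round_shift_scalar(int16_t n, int bit) {
      int32_t r = n;
      if (bit - 2 >= 0) r += 1 << (bit - 2); /* k_const_rounding */
      r -= n >> 15;                          /* -1 if n < 0, else 0 */
      return (int16_t)(r >> bit);
    }

    int main(void) {
      assert(round_shift_scalar(-7, 1) == -7 / 2); /* -3, not -4 */
      assert(round_shift_scalar(7, 1) == 7 / 2);   /*  3 */
      return 0;
    }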
-// perform in-place transpose
-static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-  // 00 10 01 11 02 12 03 13
-  // 20 30 21 31 22 32 23 33
-  // 04 14 05 15 06 16 07 17
-  // 24 34 25 35 26 36 27 37
-  // 40 50 41 51 42 52 43 53
-  // 60 70 61 71 62 72 63 73
-  // 44 54 45 55 46 56 47 57
-  // 64 74 65 75 66 76 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-  // 00 10 20 30 01 11 21 31
-  // 40 50 60 70 41 51 61 71
-  // 02 12 22 32 03 13 23 33
-  // 42 52 62 72 43 53 63 73
-  // 04 14 24 34 05 15 25 35
-  // 44 54 64 74 45 55 65 75
-  // 06 16 26 36 07 17 27 37
-  // 46 56 66 76 47 57 67 77
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-  // 00 10 20 30 40 50 60 70
-  // 01 11 21 31 41 51 61 71
-  // 02 12 22 32 42 52 62 72
-  // 03 13 23 33 43 53 63 73
-  // 04 14 24 34 44 54 64 74
-  // 05 15 25 35 45 55 65 75
-  // 06 16 26 36 46 56 66 76
-  // 07 17 27 37 47 57 67 77
-}
-
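array_transpose_8x8_avx2() is the standard staged unpack transpose: interleave 16-bit lanes, then 32-bit pairs, then 64-bit halves. The same pattern is easier to see on a 4x4 matrix of 32-bit elements, where only two stages are needed; a self-contained sketch (illustrative only, not from the removed file):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Two-stage unpack transpose of a 4x4 int32 matrix, one row per
     * register; the 8x8 16-bit version above adds one more stage. */
    static void transpose_4x4_epi32(__m128i r[4]) {
      const __m128i a0 = _mm_unpacklo_epi32(r[0], r[1]); /* 00 10 01 11 */
      const __m128i a1 = _mm_unpacklo_epi32(r[2], r[3]); /* 20 30 21 31 */
      const __m128i a2 = _mm_unpackhi_epi32(r[0], r[1]); /* 02 12 03 13 */
      const __m128i a3 = _mm_unpackhi_epi32(r[2], r[3]); /* 22 32 23 33 */
      r[0] = _mm_unpacklo_epi64(a0, a1);                 /* 00 10 20 30 */
      r[1] = _mm_unpackhi_epi64(a0, a1);                 /* 01 11 21 31 */
      r[2] = _mm_unpacklo_epi64(a2, a3);                 /* 02 12 22 32 */
      r[3] = _mm_unpackhi_epi64(a2, a3);                 /* 03 13 23 33 */
    }

    int main(void) {
      int32_t m[16];
      __m128i r[4];
      int i;
      for (i = 0; i < 16; ++i) m[i] = i;
      for (i = 0; i < 4; ++i) r[i] = _mm_loadu_si128((__m128i *)(m + 4 * i));
      transpose_4x4_epi32(r);
      for (i = 0; i < 4; ++i) _mm_storeu_si128((__m128i *)(m + 4 * i), r[i]);
      printf("%d %d %d %d\n", m[0], m[1], m[2], m[3]); /* 0 4 8 12 */
      return 0;
    }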
-void fdct8_avx2(__m128i *in) {
-  // constants
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
-  // stage 1
-  s0 = _mm_add_epi16(in[0], in[7]);
-  s1 = _mm_add_epi16(in[1], in[6]);
-  s2 = _mm_add_epi16(in[2], in[5]);
-  s3 = _mm_add_epi16(in[3], in[4]);
-  s4 = _mm_sub_epi16(in[3], in[4]);
-  s5 = _mm_sub_epi16(in[2], in[5]);
-  s6 = _mm_sub_epi16(in[1], in[6]);
-  s7 = _mm_sub_epi16(in[0], in[7]);
-
-  u0 = _mm_add_epi16(s0, s3);
-  u1 = _mm_add_epi16(s1, s2);
-  u2 = _mm_sub_epi16(s1, s2);
-  u3 = _mm_sub_epi16(s0, s3);
-  // interleave and perform butterfly multiplication/addition
-  v0 = _mm_unpacklo_epi16(u0, u1);
-  v1 = _mm_unpackhi_epi16(u0, u1);
-  v2 = _mm_unpacklo_epi16(u2, u3);
-  v3 = _mm_unpackhi_epi16(u2, u3);
-
-  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
-  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
-  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
-  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
-  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
-  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
-  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
-  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[4] = _mm_packs_epi32(u2, u3);
-  in[6] = _mm_packs_epi32(u6, u7);
-
-  // stage 2
-  // interleave and perform butterfly multiplication/addition
-  u0 = _mm_unpacklo_epi16(s6, s5);
-  u1 = _mm_unpackhi_epi16(s6, s5);
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
-  u0 = _mm_packs_epi32(v0, v1);
-  u1 = _mm_packs_epi32(v2, v3);
-
-  // stage 3
-  s0 = _mm_add_epi16(s4, u0);
-  s1 = _mm_sub_epi16(s4, u0);
-  s2 = _mm_sub_epi16(s7, u1);
-  s3 = _mm_add_epi16(s7, u1);
-
-  // stage 4
-  u0 = _mm_unpacklo_epi16(s0, s3);
-  u1 = _mm_unpackhi_epi16(s0, s3);
-  u2 = _mm_unpacklo_epi16(s1, s2);
-  u3 = _mm_unpackhi_epi16(s1, s2);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
-  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
-  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
-  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
-  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
-  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
-  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
-  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
-  // shift and rounding
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  in[1] = _mm_packs_epi32(v0, v1);
-  in[3] = _mm_packs_epi32(v4, v5);
-  in[5] = _mm_packs_epi32(v2, v3);
-  in[7] = _mm_packs_epi32(v6, v7);
-
-  // transpose
-  array_transpose_8x8_avx2(in, in);
-}
-
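Every rotation in fdct8_avx2() follows one recipe: unpack two 16-bit rows so (x, y) pairs sit side by side, multiply by a pair_set_epi16(c0, c1) constant so _mm_madd_epi16 yields x*c0 + y*c1 per 32-bit lane, then round back down to 16 bits. A scalar model of one output lane (my sketch; the value 14 for DCT_CONST_BITS is an assumption, matching its usual definition in libvpx):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One lane of the unpack + _mm_madd_epi16 + dct_const_round_shift
     * pattern: b = round((x * c0 + y * c1) / 2^14). */
    static int16_t butterfly_lane(int16_t x, int16_t y,
                                  int16_t c0, int16_t c1) {
      const int32_t b = x * c0 + y * c1; /* what _mm_madd_epi16 computes */
      return (int16_t)((b + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    }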
-void fadst8_avx2(__m128i *in) {
-  // Constants
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integer
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit intergers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  // FIXME(jingning): do subtract using bit inversion?
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-
-  // transpose
-  array_transpose_8x8_avx2(in, in);
-}
-
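On the FIXME near the end of fadst8_avx2() above: two's-complement negation is bitwise inversion plus one, so the _mm_sub_epi16(k__const_0, x) calls could in principle be traded for an XOR with all-ones followed by an add. A sketch of the equivalence (illustrative only, not a change this diff makes):

    #include <emmintrin.h>

    /* 0 - x == (~x) + 1 for each 16-bit lane; same result, one more
     * constant register live. */
    static __m128i negate_epi16(__m128i x) {
      const __m128i all_ones = _mm_set1_epi16(-1);
      const __m128i one = _mm_set1_epi16(1);
      return _mm_add_epi16(_mm_xor_si128(x, all_ones), one);
    }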
-void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
-                     int stride, int tx_type) {
-  __m128i in[8];
-
-  switch (tx_type) {
-    case DCT_DCT:
-      vp9_fdct8x8_avx2(input, output, stride);
-      break;
-    case ADST_DCT:
-      load_buffer_8x8_avx2(input, in, stride);
-      fadst8_avx2(in);
-      fdct8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    case DCT_ADST:
-      load_buffer_8x8_avx2(input, in, stride);
-      fdct8_avx2(in);
-      fadst8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    case ADST_ADST:
-      load_buffer_8x8_avx2(input, in, stride);
-      fadst8_avx2(in);
-      fadst8_avx2(in);
-      right_shift_8x8_avx2(in, 1);
-      write_buffer_8x8_avx2(output, in, 8);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
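vp9_fht8x8_avx2() just composes the two 1-D routines according to tx_type, with DCT_DCT short-circuiting to the standalone 8x8 DCT instead of the load/transform/shift/write pipeline. A caller sketch (buffer names hypothetical; ADST_DCT and friends are the tx_type enum values used in the switch above):

    /* Hypothetical caller: forward-transform one 8x8 block of residuals,
     * mixing ADST and DCT per dimension as selected by tx_type. */
    static void fht8x8_example(const int16_t *src_diff, int16_t *coeff) {
      vp9_fht8x8_avx2(src_diff, coeff, 8 /* stride */, ADST_DCT);
    }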
-void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
-  const int16_t *in = input;
-  int16_t *out = intermediate;
-  // Constants
-  // When we use them, in one case, they are all the same. In all others
-  // it's a pair of them that we need to repeat four times. This is done
-  // by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = _mm_add_epi16(in00, in15);
-        input1 = _mm_add_epi16(in01, in14);
-        input2 = _mm_add_epi16(in02, in13);
-        input3 = _mm_add_epi16(in03, in12);
-        input4 = _mm_add_epi16(in04, in11);
-        input5 = _mm_add_epi16(in05, in10);
-        input6 = _mm_add_epi16(in06, in09);
-        input7 = _mm_add_epi16(in07, in08);
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = _mm_sub_epi16(in07, in08);
-        step1_1 = _mm_sub_epi16(in06, in09);
-        step1_2 = _mm_sub_epi16(in05, in10);
-        step1_3 = _mm_sub_epi16(in04, in11);
-        step1_4 = _mm_sub_epi16(in03, in12);
-        step1_5 = _mm_sub_epi16(in02, in13);
-        step1_6 = _mm_sub_epi16(in01, in14);
-        step1_7 = _mm_sub_epi16(in00, in15);
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = _mm_add_epi16(input0, input7);
-        const __m128i q1 = _mm_add_epi16(input1, input6);
-        const __m128i q2 = _mm_add_epi16(input2, input5);
-        const __m128i q3 = _mm_add_epi16(input3, input4);
-        const __m128i q4 = _mm_sub_epi16(input3, input4);
-        const __m128i q5 = _mm_sub_epi16(input2, input5);
-        const __m128i q6 = _mm_sub_epi16(input1, input6);
-        const __m128i q7 = _mm_sub_epi16(input0, input7);
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = _mm_add_epi16(q0, q3);
-          const __m128i r1 = _mm_add_epi16(q1, q2);
-          const __m128i r2 = _mm_sub_epi16(q1, q2);
-          const __m128i r3 = _mm_sub_epi16(q0, q3);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res00 = _mm_packs_epi32(w0, w1);
-          res08 = _mm_packs_epi32(w2, w3);
-          res04 = _mm_packs_epi32(w4, w5);
-          res12 = _mm_packs_epi32(w6, w7);
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-          // Combine
-          const __m128i r0 = _mm_packs_epi32(s0, s1);
-          const __m128i r1 = _mm_packs_epi32(s2, s3);
-          // Add/subtract
-          const __m128i x0 = _mm_add_epi16(q4, r0);
-          const __m128i x1 = _mm_sub_epi16(q4, r0);
-          const __m128i x2 = _mm_sub_epi16(q7, r1);
-          const __m128i x3 = _mm_add_epi16(q7, r1);
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res02 = _mm_packs_epi32(w0, w1);
-          res14 = _mm_packs_epi32(w2, w3);
-          res10 = _mm_packs_epi32(w4, w5);
-          res06 = _mm_packs_epi32(w6, w7);
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_2 = _mm_packs_epi32(w0, w1);
-          step2_3 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_5 = _mm_packs_epi32(w0, w1);
-          step2_4 = _mm_packs_epi32(w2, w3);
-        }
-        // step 3
-        {
-          step3_0 = _mm_add_epi16(step1_0, step2_3);
-          step3_1 = _mm_add_epi16(step1_1, step2_2);
-          step3_2 = _mm_sub_epi16(step1_1, step2_2);
-          step3_3 = _mm_sub_epi16(step1_0, step2_3);
-          step3_4 = _mm_sub_epi16(step1_7, step2_4);
-          step3_5 = _mm_sub_epi16(step1_6, step2_5);
-          step3_6 = _mm_add_epi16(step1_6, step2_5);
-          step3_7 = _mm_add_epi16(step1_7, step2_4);
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_1 = _mm_packs_epi32(w0, w1);
-          step2_2 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          step2_6 = _mm_packs_epi32(w0, w1);
-          step2_5 = _mm_packs_epi32(w2, w3);
-        }
-        // step 5
-        {
-          step1_0 = _mm_add_epi16(step3_0, step2_1);
-          step1_1 = _mm_sub_epi16(step3_0, step2_1);
-          step1_2 = _mm_sub_epi16(step3_3, step2_2);
-          step1_3 = _mm_add_epi16(step3_3, step2_2);
-          step1_4 = _mm_add_epi16(step3_4, step2_5);
-          step1_5 = _mm_sub_epi16(step3_4, step2_5);
-          step1_6 = _mm_sub_epi16(step3_7, step2_6);
-          step1_7 = _mm_add_epi16(step3_7, step2_6);
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res01 = _mm_packs_epi32(w0, w1);
-          res09 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res05 = _mm_packs_epi32(w0, w1);
-          res13 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res11 = _mm_packs_epi32(w0, w1);
-          res03 = _mm_packs_epi32(w2, w3);
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
-          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
-          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          // Combine
-          res15 = _mm_packs_epi32(w0, w1);
-          res07 = _mm_packs_epi32(w2, w3);
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
-        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
-        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
-        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
-        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
-        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
-        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
-        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
-      }
-      {
-        // 00 01 02 03 04 05 06 07
-        // 10 11 12 13 14 15 16 17
-        // 20 21 22 23 24 25 26 27
-        // 30 31 32 33 34 35 36 37
-        // 40 41 42 43 44 45 46 47
-        // 50 51 52 53 54 55 56 57
-        // 60 61 62 63 64 65 66 67
-        // 70 71 72 73 74 75 76 77
-        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
-        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
-        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
-        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
-        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
-        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
-        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
-        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
-        // 00 10 01 11 02 12 03 13
-        // 20 30 21 31 22 32 23 33
-        // 04 14 05 15 06 16 07 17
-        // 24 34 25 35 26 36 27 37
-        // 40 50 41 51 42 52 43 53
-        // 60 70 61 71 62 72 63 73
-        // 54 54 55 55 56 56 57 57
-        // 64 74 65 75 66 76 67 77
-        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-        // 00 10 20 30 01 11 21 31
-        // 40 50 60 70 41 51 61 71
-        // 02 12 22 32 03 13 23 33
-        // 42 52 62 72 43 53 63 73
-        // 04 14 24 34 05 15 21 36
-        // 44 54 64 74 45 55 61 76
-        // 06 16 26 36 07 17 27 37
-        // 46 56 66 76 47 57 67 77
-        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-        // 00 10 20 30 40 50 60 70
-        // 01 11 21 31 41 51 61 71
-        // 02 12 22 32 42 52 62 72
-        // 03 13 23 33 43 53 63 73
-        // 04 14 24 34 44 54 64 74
-        // 05 15 25 35 45 55 65 75
-        // 06 16 26 36 46 56 66 76
-        // 07 17 27 37 47 57 67 77
-        // Store results
-        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
-        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
-        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
-        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
-        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
-        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
-        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
-        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
-      }
-      out += 8*16;
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-    out = output;
-  }
-}
-
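Worth noting how the two passes of vp9_fdct16x16_avx2() keep precision: pass 0 pre-scales samples by four (x << 2) before the column transform, and pass 1 removes that scaling while re-loading the intermediate with (x + 1) >> 2. A scalar model of the conditioning (my sketch, not from the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int16_t x = 37;
      int16_t pre = (int16_t)(x << 2);          /* pass-0 pre-condition */
      int16_t back = (int16_t)((pre + 1) >> 2); /* pass-1 load rounding */
      /* The +1 makes the shift round to nearest for the values the
       * column transform actually produces, not just exact multiples
       * of four. */
      printf("%d -> %d -> %d\n", x, pre, back);
      return 0;
    }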
-static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
-                                          __m128i *in1, int stride) {
-  // load first 8 columns
-  load_buffer_8x8_avx2(input, in0, stride);
-  load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
-
-  input += 8;
-  // load second 8 columns
-  load_buffer_8x8_avx2(input, in1, stride);
-  load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
-                                           __m128i *in1, int stride) {
-  // write first 8 columns
-  write_buffer_8x8_avx2(output, in0, stride);
-  write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
-  // write second 8 columns
-  output += 8;
-  write_buffer_8x8_avx2(output, in1, stride);
-  write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8_avx2(res0, res0);
-  array_transpose_8x8_avx2(res1, tbuf);
-  array_transpose_8x8_avx2(res0 + 8, res1);
-  array_transpose_8x8_avx2(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
-  // perform rounding operations
-  right_shift_8x8_avx2(res0, 2);
-  right_shift_8x8_avx2(res0 + 8, 2);
-  right_shift_8x8_avx2(res1, 2);
-  right_shift_8x8_avx2(res1 + 8, 2);
-}
-
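array_transpose_16x16_avx2() builds the 16x16 transpose from four 8x8 block transposes: each quadrant is transposed and the two off-diagonal quadrants swap places, with tbuf[] holding one quadrant during the swap. The same structure in plain C (scalar analogue, illustrative only):

    #include <stdint.h>

    /* Transpose one 8x8 block of a 16x16 matrix: block at (r, c) in src
     * lands, transposed, at (dr, dc) in dst. */
    static void transpose8_block(const int16_t src[16][16], int r, int c,
                                 int16_t dst[16][16], int dr, int dc) {
      int i, j;
      for (i = 0; i < 8; ++i)
        for (j = 0; j < 8; ++j)
          dst[dr + j][dc + i] = src[r + i][c + j];
    }

    /* M = [[A B],[C D]] transposes to [[A' C'],[B' D']]: diagonal blocks
     * transpose in place, off-diagonal blocks transpose and swap. */
    static void transpose16(const int16_t src[16][16], int16_t dst[16][16]) {
      transpose8_block(src, 0, 0, dst, 0, 0); /* A -> upper-left  */
      transpose8_block(src, 0, 8, dst, 8, 0); /* B -> lower-left  */
      transpose8_block(src, 8, 0, dst, 0, 8); /* C -> upper-right */
      transpose8_block(src, 8, 8, dst, 8, 8); /* D -> lower-right */
    }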
in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = 
_mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], 
k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = _mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - 
v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -void fadst16_8col_avx2(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); 
- u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], 
k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = 
_mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = 
_mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = 
_mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], 
DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -void fdct16_avx2(__m128i *in0, __m128i *in1) { - fdct16_8col_avx2(in0); - fdct16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void fadst16_avx2(__m128i *in0, __m128i *in1) { - fadst16_8col_avx2(in0); - fadst16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in0[16], in1[16]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct16x16_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - default: - assert(0); - break; - } -} #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c index f31b176e5..1feed6256 100644 --- a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c @@ -31,7 +31,7 @@ void vp9_sad32x32x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 32 ; i++) { // load src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); @@ -103,8 +103,8 @@ void vp9_sad64x64x4d_avx2(uint8_t *src, sum_ref3 = _mm256_set1_epi16(0); for (i = 0; i < 64 ; i++) { // load 64 bytes from src and all refs - src_reg = _mm256_load_si256((__m256i *)(src)); - srcnext_reg = _mm256_load_si256((__m256i *)(src + 32)); + src_reg = _mm256_loadu_si256((__m256i *)(src)); + srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c index 34ed1867f..a441cadaf 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c @@ -67,7 +67,7 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { #define LOAD_SRC_DST \ /* load source and destination */ \ src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = 
_mm256_load_si256((__m256i const *) (dst)); + dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); #define AVG_NEXT_SRC(src_reg, size_stride) \ src_next_reg = _mm256_loadu_si256((__m256i const *) \ @@ -333,7 +333,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, if (y_offset == 0) { for (i = 0; i < height ; i++) { LOAD_SRC_DST - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -347,7 +347,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expend each byte to 2 bytes @@ -369,7 +369,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, src_stride) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_reg, zero_reg) @@ -385,7 +385,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, for (i = 0; i < height ; i++) { LOAD_SRC_DST AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -409,7 +409,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, AVG_NEXT_SRC(src_reg, 1) // average between previous average to current average src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); sec+= sec_stride; // expand each byte to 2 bytes @@ -437,7 +437,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_WITH_SRC(src_avg, src_reg) FILTER_SRC(filter) src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_avg = _mm256_avg_epu8(src_avg, sec_reg); // expand each byte to 2 bytes MERGE_WITH_SRC(src_avg, zero_reg) @@ -459,7 +459,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, MERGE_NEXT_SRC(src_reg, 1) FILTER_SRC(filter) src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_reg = _mm256_avg_epu8(src_reg, sec_reg); MERGE_WITH_SRC(src_reg, zero_reg) sec+= sec_stride; @@ -487,7 +487,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); // average between previous pack to the current src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); sec+= sec_stride; MERGE_WITH_SRC(src_pack, zero_reg) @@ -524,7 +524,7 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, // filter the source FILTER_SRC(yfilter) src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_load_si256((__m256i const *) (sec)); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); src_pack = _mm256_avg_epu8(src_pack, sec_reg); MERGE_WITH_SRC(src_pack, zero_reg) src_pack = src_reg; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index ba9750032..86ad00501 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -67,7 +67,6 @@ VP9_COMMON_SRCS-yes += common/vp9_common_data.h VP9_COMMON_SRCS-yes += common/vp9_scan.c VP9_COMMON_SRCS-yes += common/vp9_scan.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 3d0e6382b..11b8205d9 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -39,6 +39,8 @@ struct vp9_extracfg { unsigned int frame_parallel_decoding_mode; AQ_MODE aq_mode; unsigned int frame_periodic_boost; + vpx_bit_depth_t bit_depth; + vp9e_tune_content content; }; struct extraconfig_map { @@ -68,6 +70,8 @@ static const struct extraconfig_map extracfg_map[] = { 0, // frame_parallel_decoding_mode NO_AQ, // aq_mode 0, // frame_periodic_delta_q + VPX_BITS_8, // Bit depth + VP9E_CONTENT_DEFAULT // content } } }; @@ -174,7 +178,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS); -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC if (cfg->ss_number_layers > 1) { unsigned int i, alt_ref_sum = 0; for (i = 0; i < cfg->ss_number_layers; ++i) { @@ -219,6 +223,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, cq_level, 0, 63); RANGE_CHECK(cfg, g_bit_depth, VPX_BITS_8, VPX_BITS_12); RANGE_CHECK(cfg, g_in_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, + VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); // TODO(yaowu): remove this when ssim tuning is implemented for vp9 if (extra_cfg->tuning == VP8_TUNE_SSIM) @@ -312,6 +318,7 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, default: ERROR("Invalid image format. 
Only YV12, I420, I422, I444 images are " "supported."); + break; } if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) @@ -320,6 +327,20 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + case VPX_IMG_FMT_I42016: return 24; + case VPX_IMG_FMT_I42216: return 32; + case VPX_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, @@ -330,19 +351,22 @@ static vpx_codec_err_t set_encoder_config( oxcf->bit_depth = cfg->g_bit_depth; oxcf->in_bit_depth = cfg->g_in_bit_depth; // guess a frame rate if out of whack, use 30 - oxcf->framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; - if (oxcf->framerate > 180) - oxcf->framerate = 30; + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (oxcf->init_framerate > 180) + oxcf->init_framerate = 30; switch (cfg->g_pass) { case VPX_RC_ONE_PASS: oxcf->mode = ONE_PASS_GOOD; + oxcf->pass = 0; break; case VPX_RC_FIRST_PASS: oxcf->mode = TWO_PASS_FIRST; + oxcf->pass = 1; break; case VPX_RC_LAST_PASS: oxcf->mode = TWO_PASS_SECOND_BEST; + oxcf->pass = 2; break; } @@ -401,6 +425,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->arnr_type = extra_cfg->arnr_type; oxcf->tuning = extra_cfg->tuning; + oxcf->content = extra_cfg->content; oxcf->tile_columns = extra_cfg->tile_columns; oxcf->tile_rows = extra_cfg->tile_rows; @@ -418,7 +443,7 @@ static vpx_codec_err_t set_encoder_config( int i; for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) { oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i]; -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i]; #endif } @@ -687,22 +712,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->extra_cfg = extracfg_map[i].cfg; priv->extra_cfg.pkt_list = &priv->pkt_list.head; - // Maximum buffer size approximated based on having multiple ARF. 
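  // (Worked sizing example for orientation only; the arithmetic is
  // illustrative, not part of the patch: the fixed allocation removed here
  // assumed 12 bits/pixel (3/2 bytes) and up to 8 ARFs. The lazy replacement
  // in encoder_encode() below sizes from the real format instead: a 1920x1080
  // I420 frame has get_image_bps() == 12, so 1920 * 1080 * 12 / 8 == 3110400
  // bytes per frame, scaled by 8 only when multiple alt-refs are allowed and
  // by 2 otherwise.)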
-    priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-
-#if CONFIG_VP9_HIGH
-    if (ctx->init_flags&VPX_CODEC_USE_HIGH) {
-      priv->cx_data_sz <<= 1;
-    }
-#endif
-
-    if (priv->cx_data_sz < 4096)
-      priv->cx_data_sz = 4096;
-
-    priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz);
-    if (priv->cx_data == NULL)
-      return VPX_CODEC_MEM_ERROR;
-
     vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -823,6 +832,20 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {
   return index_sz;
 }
 
+// vp9 uses 10,000,000 ticks/second as its timestamp unit
+#define TICKS_PER_SEC 10000000LL
+
+static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase,
+                                       int64_t n) {
+  const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+  return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}
+
 static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       const vpx_image_t *img,
                                       vpx_codec_pts_t pts,
@@ -830,9 +853,26 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
                                       vpx_enc_frame_flags_t flags,
                                       unsigned long deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
+  const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
 
-  if (img)
+  if (img != NULL) {
     res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition; encoder setup is currently done fully in init().
+    if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+      // There's no codec control for multiple alt-refs, so check the encoder
+      // instance for its status to determine the compressed data size.
+      ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
+                        get_image_bps(img) / 8 *
+                        (ctx->cpi->multi_arf_allowed ?
8 : 2); + if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; + + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -859,7 +899,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK && ctx->cpi != NULL) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp, dst_end_time_stamp; + int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_end_time_stamp = + timebase_units_to_ticks(timebase, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; @@ -867,12 +909,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; - /* vp9 use 10,000,000 ticks/second as time stamp */ - dst_time_stamp = (pts * 10000000 * ctx->cfg.g_timebase.num) - / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; - if (img != NULL) { res = image2yuvconfig(img, &sd); @@ -909,19 +945,18 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { if (size) { - vpx_codec_pts_t round, delta; - vpx_codec_cx_pkt_t pkt; VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; + vpx_codec_cx_pkt_t pkt; -#ifdef CONFIG_SPATIAL_SVC - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) cpi->svc.layer_context[cpi->svc.spatial_layer_id].layer_size += size; #endif // Pack invisible frames with the next visible frame if (cpi->common.show_frame == 0 -#ifdef CONFIG_SPATIAL_SVC - || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) #endif ) { @@ -936,20 +971,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } // Add the frame packet to the list of returned packets. - round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; - delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000; - pkt.data.frame.duration = (unsigned long) - ((delta * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000); + pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.duration = ticks_to_timebase_units( + timebase, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY -#ifdef CONFIG_SPATIAL_SVC - || (cpi->use_svc && cpi->svc.number_temporal_layers == 1 && +#if CONFIG_SPATIAL_SVC + || (is_spatial_svc(cpi) && cpi->svc.layer_context[0].is_key_frame) #endif ) @@ -962,9 +992,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // prior PTS so that if a decoder uses pts to schedule when // to do this, we start right after last frame was decoded. // Invisible frames have no duration. 
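          // (Worked example, illustration only: with a 1/30 timebase,
          // timebase_units_to_ticks() maps pts n to n * 10000000 / 30 ticks,
          // and ticks_to_timebase_units() rounds the division back so the
          // original pts survives the round trip; the + 1 below then places
          // the invisible frame one timebase unit after the last shown one.)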
- pkt.data.frame.pts = ((cpi->last_time_stamp_seen - * ctx->cfg.g_timebase.den + round) - / ctx->cfg.g_timebase.num / 10000000) + 1; + pkt.data.frame.pts = + ticks_to_timebase_units(timebase, cpi->last_time_stamp_seen) + 1; pkt.data.frame.duration = 0; } @@ -990,8 +1019,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt); cx_data += size; cx_data_sz -= size; -#ifdef CONFIG_SPATIAL_SVC - if (cpi->use_svc && cpi->svc.number_temporal_layers == 1) { +#if CONFIG_SPATIAL_SVC + if (is_spatial_svc(cpi)) { vpx_codec_cx_pkt_t pkt = {0}; int i; pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; @@ -1051,9 +1080,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb; + YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + if (fb == NULL) return VPX_CODEC_ERROR; - vp9_get_reference_enc(ctx->cpi, frame->idx, &fb); yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; } else { @@ -1221,6 +1250,13 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.content = CAST(VP9E_SET_TUNE_CONTENT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP8_COPY_REFERENCE, ctrl_copy_reference}, {VP8E_UPD_ENTROPY, ctrl_update_entropy}, @@ -1253,6 +1289,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_SET_SVC, ctrl_set_svc}, {VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters}, {VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id}, + {VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content}, // Getters {VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer}, @@ -1314,9 +1351,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 9999, // kf_max_dist VPX_SS_DEFAULT_LAYERS, // ss_number_layers -#ifdef CONFIG_SPATIAL_SVC {0}, -#endif {0}, // ss_target_bitrate 1, // ts_number_layers {0}, // ts_target_bitrate @@ -1328,7 +1363,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { #endif } }, - { -1, {NOT_IMPLEMENTED}} }; #ifndef VERSION_STRING @@ -1346,8 +1380,6 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { encoder_init, // vpx_codec_init_fn_t encoder_destroy, // vpx_codec_destroy_fn_t encoder_ctrl_maps, // vpx_codec_ctrl_fn_map_t - NOT_IMPLEMENTED, // vpx_codec_get_mmap_fn_t - NOT_IMPLEMENTED, // vpx_codec_set_mmap_fn_t { // NOLINT NOT_IMPLEMENTED, // vpx_codec_peek_si_fn_t NOT_IMPLEMENTED, // vpx_codec_get_si_fn_t @@ -1356,6 +1388,7 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t }, { // NOLINT + 1, // 1 cfg map encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t encoder_encode, // vpx_codec_encode_fn_t encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index a2f77b11d..28475f39e 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -41,6 +41,7 @@ struct vpx_codec_alg_priv { void *decrypt_state; vpx_image_t img; int img_avail; + int flushed; int invert_tile_order; int frame_parallel_decode; // frame-based threading. 
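The decoder side of the cleanup follows, and the new `flushed` flag changes observable behaviour: draining the decoder by passing a NULL buffer of zero length now records the flush and succeeds instead of returning VPX_CODEC_INVALID_PARAM, as the decoder_decode() hunk below shows. A minimal caller-side sketch (illustrative only; `read_frame()` and `die()` are hypothetical helpers, and error handling is up to the caller):

    vpx_codec_ctx_t decoder;  // initialized elsewhere with vpx_codec_vp9_dx()
    const uint8_t *buf;
    unsigned int buf_sz;

    // Feed every compressed frame to the decoder as usual...
    while (read_frame(&buf, &buf_sz)) {
      if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
        die();
    }
    // ...then signal end-of-stream; with this patch the flush call
    // returns VPX_CODEC_OK and latches the decoder's flushed flag.
    if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0))
      die();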
@@ -70,6 +71,7 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
     ctx->priv->alg_priv = alg_priv;
     ctx->priv->alg_priv->si.sz = sizeof(ctx->priv->alg_priv->si);
     ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->alg_priv->flushed = 0;
     ctx->priv->alg_priv->frame_parallel_decode =
         (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
@@ -174,6 +176,7 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
       vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h);
     } else {
       intra_only_flag = show_frame ? 0 : vp9_rb_read_bit(&rb);
+      rb.bit_offset += error_resilient ? 0 : 2;  // reset_frame_context
 
       if (intra_only_flag) {
@@ -413,8 +416,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
   uint32_t frame_sizes[8];
   int frame_count;
 
-  if (data == NULL || data_sz == 0)
-    return VPX_CODEC_INVALID_PARAM;
+  if (data == NULL && data_sz == 0) {
+    ctx->flushed = 1;
+    return VPX_CODEC_OK;
+  }
+
+  // Reset flushed when receiving a valid frame.
+  ctx->flushed = 0;
 
   res = parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
                                ctx->decrypt_cb, ctx->decrypt_state);
@@ -578,9 +586,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
   vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
 
   if (data) {
-    YV12_BUFFER_CONFIG* fb;
+    YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx);
+    if (fb == NULL) return VPX_CODEC_ERROR;
 
-    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);
     yuvconfig2image(&data->img, fb, NULL);
     return VPX_CODEC_OK;
   } else {
@@ -634,11 +642,10 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
                                                 va_list args) {
   int *corrupted = va_arg(args, int *);
 
-  if (corrupted) {
-    if (ctx->pbi)
-      *corrupted = ctx->pbi->common.frame_to_show->corrupted;
-    else
-      return VPX_CODEC_ERROR;
+  if (corrupted != NULL && ctx->pbi != NULL) {
+    const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show;
+    if (frame == NULL) return VPX_CODEC_ERROR;
+    *corrupted = frame->corrupted;
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;
@@ -729,8 +736,6 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
   decoder_init,       // vpx_codec_init_fn_t
   decoder_destroy,    // vpx_codec_destroy_fn_t
   decoder_ctrl_maps,  // vpx_codec_ctrl_fn_map_t
-  NOT_IMPLEMENTED,    // vpx_codec_get_mmap_fn_t
-  NOT_IMPLEMENTED,    // vpx_codec_set_mmap_fn_t
   { // NOLINT
     decoder_peek_si,    // vpx_codec_peek_si_fn_t
     decoder_get_si,     // vpx_codec_get_si_fn_t
@@ -739,6 +744,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
     decoder_set_fb_fn,  // vpx_codec_set_fb_fn_t
   },
   { // NOLINT
+    0,
     NOT_IMPLEMENTED,  // vpx_codec_enc_cfg_map_t
     NOT_IMPLEMENTED,  // vpx_codec_encode_fn_t
     NOT_IMPLEMENTED,  // vpx_codec_get_cx_data_fn_t
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index f21f00307..bb37201f6 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -41,11 +41,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
+  img->planes[VPX_PLANE_ALPHA] = NULL;
   img->stride[VPX_PLANE_Y] = yv12->y_stride;
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
 #if CONFIG_VP9_HIGH
   if (yv12->flags & YV12_FLAG_HIGH) {
     // VPX IMG uses byte strides and a pointer to the first byte of the image
@@ -57,12 +57,11 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
         CONVERT_TO_SHORTPTR(yv12->u_buffer);
     img->planes[VPX_PLANE_V] = (uint8_t*)
         CONVERT_TO_SHORTPTR(yv12->v_buffer);
-    img->planes[VPX_PLANE_ALPHA] = (uint8_t*)
-        CONVERT_TO_SHORTPTR(yv12->alpha_buffer);
+    img->planes[VPX_PLANE_ALPHA] = NULL;
     img->stride[VPX_PLANE_Y] = 2 * yv12->y_stride;
     img->stride[VPX_PLANE_U] = 2 * yv12->uv_stride;
     img->stride[VPX_PLANE_V] = 2 * yv12->uv_stride;
-    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->alpha_stride;
+    img->stride[VPX_PLANE_ALPHA] = 2 * yv12->y_stride;
   }
 #endif
   img->bps = bps;
@@ -77,7 +76,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->y_buffer = img->planes[VPX_PLANE_Y];
   yv12->u_buffer = img->planes[VPX_PLANE_U];
   yv12->v_buffer = img->planes[VPX_PLANE_V];
-  yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];
 
   yv12->y_crop_width = img->d_w;
   yv12->y_crop_height = img->d_h;
@@ -89,12 +87,8 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
                                              : yv12->y_height;
 
-  yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;
-  yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
-
   yv12->y_stride = img->stride[VPX_PLANE_Y];
   yv12->uv_stride = img->stride[VPX_PLANE_U];
-  yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
 
 #if CONFIG_VP9_HIGH
   if (img->fmt & VPX_IMG_FMT_HIGH) {
     // In vpx_image_t
@@ -114,7 +108,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
     yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
     yv12->y_stride >>= 1;
     yv12->uv_stride >>= 1;
-    yv12->alpha_stride >>= 1;
     yv12->flags = YV12_FLAG_HIGH;
   } else {
     yv12->flags = 0;
@@ -122,13 +115,6 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
   yv12->border = (yv12->y_stride - img->w) / 2;
 #else
   yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-#endif
-#if CONFIG_ALPHA
-  // For development purposes, force alpha to hold the same data as Y for now.
- yv12->alpha_buffer = yv12->y_buffer; - yv12->alpha_width = yv12->y_width; - yv12->alpha_height = yv12->y_height; - yv12->alpha_stride = yv12->y_stride; #endif return VPX_CODEC_OK; } diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 3f2f5b9d9..dc46c4e35 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -23,8 +23,8 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h VP9_CX_SRCS-yes += encoder/vp9_cost.h VP9_CX_SRCS-yes += encoder/vp9_cost.c VP9_CX_SRCS-yes += encoder/vp9_dct.c -VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.c -VP9_CX_SRCS-$(CONFIG_DENOISING) += encoder/vp9_denoiser.h +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c +VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c @@ -130,5 +130,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 82d2bc3c0..a7716d130 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -248,37 +248,6 @@ typedef vpx_codec_err_t (*vpx_codec_set_fb_fn_t)( vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); -/*\brief eXternal Memory Allocation memory map get iterator - * - * Iterates over a list of the memory maps requested by the decoder. The - * iterator storage should be initialized to NULL to start the iteration. - * Iteration is complete when this function returns NULL. - * - * \param[in out] iter Iterator storage, initialized to NULL - * - * \return Returns a pointer to an memory segment descriptor, or NULL to - * indicate end-of-list. - */ -typedef vpx_codec_err_t (*vpx_codec_get_mmap_fn_t)(const vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - -/*\brief eXternal Memory Allocation memory map set iterator - * - * Sets a memory descriptor inside the decoder instance. - * - * \param[in] ctx Pointer to this instance's context - * \param[in] mmap Memory map to store. - * - * \retval #VPX_CODEC_OK - * The memory map was accepted and stored. - * \retval #VPX_CODEC_MEM_ERROR - * The memory map was rejected. 
- */ -typedef vpx_codec_err_t (*vpx_codec_set_mmap_fn_t)(vpx_codec_ctx_t *ctx, - const vpx_codec_mmap_t *mmap); - typedef vpx_codec_err_t (*vpx_codec_encode_fn_t)(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, @@ -330,8 +299,6 @@ struct vpx_codec_iface { vpx_codec_init_fn_t init; /**< \copydoc ::vpx_codec_init_fn_t */ vpx_codec_destroy_fn_t destroy; /**< \copydoc ::vpx_codec_destroy_fn_t */ vpx_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::vpx_codec_ctrl_fn_map_t */ - vpx_codec_get_mmap_fn_t get_mmap; /**< \copydoc ::vpx_codec_get_mmap_fn_t */ - vpx_codec_set_mmap_fn_t set_mmap; /**< \copydoc ::vpx_codec_set_mmap_fn_t */ struct vpx_codec_dec_iface { vpx_codec_peek_si_fn_t peek_si; /**< \copydoc ::vpx_codec_peek_si_fn_t */ vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */ @@ -340,6 +307,7 @@ struct vpx_codec_iface { vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { + int cfg_map_count; vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ @@ -401,11 +369,11 @@ struct vpx_codec_priv_enc_mr_cfg #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #undef VPX_CTRL_USE_TYPE_DEPRECATED #define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - static typ id##__value(va_list args) {return va_arg(args, typ);} \ + static VPX_INLINE typ id##__value(va_list args) {return va_arg(args, typ);} #define CAST(id, arg) id##__value(arg) @@ -463,54 +431,11 @@ struct vpx_internal_error_info { jmp_buf jmp; }; -static void vpx_internal_error(struct vpx_internal_error_info *info, - vpx_codec_err_t error, - const char *fmt, - ...) { - va_list ap; +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...); - info->error_code = error; - info->has_detail = 0; - - if (fmt) { - size_t sz = sizeof(info->detail); - - info->has_detail = 1; - va_start(ap, fmt); - vsnprintf(info->detail, sz - 1, fmt, ap); - va_end(ap); - info->detail[sz - 1] = '\0'; - } - - if (info->setjmp) - longjmp(info->jmp, info->error_code); -} - -//------------------------------------------------------------------------------ -// mmap interface - -typedef struct { - unsigned int id; - unsigned long sz; - unsigned int align; - unsigned int flags; - unsigned long (*calc_sz)(const vpx_codec_dec_cfg_t *, vpx_codec_flags_t); -} mem_req_t; - -// Allocates mmap.priv and sets mmap.base based on mmap.sz/align/flags -// requirements. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap); - -// Frees mmap.base allocated by a call to vpx_mmap_alloc(). -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap); - -// Checks each mmap has the size requirement specificied by mem_reqs. -// Returns #VPX_CODEC_OK on success, #VPX_CODEC_MEM_ERROR otherwise. 
-vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags); #ifdef __cplusplus } // extern "C" #endif diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c index 13abad2e7..ecd88431d 100644 --- a/vpx/src/svc_encodeframe.c +++ b/vpx/src/svc_encodeframe.c @@ -517,8 +517,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, } } +#if CONFIG_SPATIAL_SVC for (i = 0; i < si->layers; ++i) enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; +#endif // modify encoder configuration enc_cfg->ss_number_layers = si->layers; @@ -724,12 +726,14 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz; break; } +#if CONFIG_SPATIAL_SVC case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { int i; for (i = 0; i < si->layers; ++i) si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; break; } +#endif default: { break; } diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 6fb8f522d..d175eae64 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -135,50 +135,25 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } -//------------------------------------------------------------------------------ -// mmap interface +void vpx_internal_error(struct vpx_internal_error_info *info, + vpx_codec_err_t error, + const char *fmt, + ...) { + va_list ap; -vpx_codec_err_t vpx_mmap_alloc(vpx_codec_mmap_t *mmap) { - unsigned int align = mmap->align ? mmap->align - 1 : 0; + info->error_code = error; + info->has_detail = 0; - if (mmap->flags & VPX_CODEC_MEM_ZERO) - mmap->priv = calloc(1, mmap->sz + align); - else - mmap->priv = malloc(mmap->sz + align); + if (fmt) { + size_t sz = sizeof(info->detail); - if (mmap->priv == NULL) return VPX_CODEC_MEM_ERROR; - mmap->base = (void *)((((uintptr_t)mmap->priv) + align) & ~(uintptr_t)align); - mmap->dtor = vpx_mmap_dtor; - return VPX_CODEC_OK; -} - -void vpx_mmap_dtor(vpx_codec_mmap_t *mmap) { - free(mmap->priv); -} - -vpx_codec_err_t vpx_validate_mmaps(const vpx_codec_stream_info_t *si, - const vpx_codec_mmap_t *mmaps, - const mem_req_t *mem_reqs, int nreqs, - vpx_codec_flags_t init_flags) { - int i; - - for (i = 0; i < nreqs - 1; ++i) { - /* Ensure the segment has been allocated */ - if (mmaps[i].base == NULL) { - return VPX_CODEC_MEM_ERROR; - } - - /* Verify variable size segment is big enough for the current si. 
*/ - if (mem_reqs[i].calc_sz != NULL) { - vpx_codec_dec_cfg_t cfg; - - cfg.w = si->w; - cfg.h = si->h; - - if (mmaps[i].sz < mem_reqs[i].calc_sz(&cfg, init_flags)) { - return VPX_CODEC_MEM_ERROR; - } - } + info->has_detail = 1; + va_start(ap, fmt); + vsnprintf(info->detail, sz - 1, fmt, ap); + va_end(ap); + info->detail[sz - 1] = '\0'; } - return VPX_CODEC_OK; + + if (info->setjmp) + longjmp(info->jmp, info->error_code); } diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c index 63fdaf308..4d22a0847 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -31,8 +31,6 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_INVALID_PARAM; else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) res = VPX_CODEC_ABI_MISMATCH; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC)) res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_ERROR_CONCEALMENT) && @@ -50,19 +48,15 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.dec = cfg; - res = VPX_CODEC_OK; - if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx, NULL); - - if (res) { - ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; - vpx_codec_destroy(ctx); - } - - if (ctx->priv) - ctx->priv->iface = ctx->iface; + res = ctx->iface->init(ctx, NULL); + if (res) { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); } + + if (ctx->priv) + ctx->priv->iface = ctx->iface; } return SAVE_STATUS(ctx, res); @@ -119,7 +113,7 @@ vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t *ctx, /* Sanity checks */ /* NULL data ptr allowed if data_sz is 0 too */ - if (!ctx || (!data && data_sz)) + if (!ctx || (!data && data_sz) || (data && !data_sz)) res = VPX_CODEC_INVALID_PARAM; else if (!ctx->iface || !ctx->priv) res = VPX_CODEC_ERROR; @@ -183,50 +177,6 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx return SAVE_STATUS(ctx, res); } - -vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter) { - vpx_codec_err_t res = VPX_CODEC_OK; - - if (!ctx || !mmap || !iter || !ctx->iface) - res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_ERROR; - else - res = ctx->iface->get_mmap(ctx, mmap, iter); - - return SAVE_STATUS(ctx, res); -} - - -vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - unsigned int num_maps) { - vpx_codec_err_t res = VPX_CODEC_MEM_ERROR; - - if (!ctx || !mmap || !ctx->iface) - res = VPX_CODEC_INVALID_PARAM; - else if (!(ctx->iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_ERROR; - else { - unsigned int i; - - for (i = 0; i < num_maps; i++, mmap++) { - if (!mmap->base) - break; - - /* Everything look ok, set the mmap in the decoder */ - res = ctx->iface->set_mmap(ctx, mmap); - - if (res) - break; - } - } - - return SAVE_STATUS(ctx, res); -} - vpx_codec_err_t vpx_codec_set_frame_buffer_functions( vpx_codec_ctx_t *ctx, vpx_get_frame_buffer_cb_fn_t cb_get, vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index ea1b346c4..cd88a8480 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -35,8 +35,6 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if 
(!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -83,8 +81,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, res = VPX_CODEC_ABI_MISMATCH; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; - else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) - res = VPX_CODEC_INCAPABLE; else if ((flags & VPX_CODEC_USE_PSNR) && !(iface->caps & VPX_CODEC_CAP_PSNR)) res = VPX_CODEC_INCAPABLE; @@ -165,6 +161,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; + int i; if (!iface || !cfg || usage > INT_MAX) res = VPX_CODEC_INVALID_PARAM; @@ -173,7 +170,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, else { res = VPX_CODEC_INVALID_PARAM; - for (map = iface->enc.cfg_maps; map->usage >= 0; map++) { + for (i = 0; i < iface->enc.cfg_map_count; ++i) { + map = iface->enc.cfg_maps + i; if (map->usage == (int)usage) { *cfg = map->cfg; cfg->g_usage = usage; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 3995c8c70..796a7a1c2 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -206,6 +206,7 @@ enum vp8e_enc_control_id { * temporal layer. */ VP9E_SET_SVC_LAYER_ID, + VP9E_SET_TUNE_CONTENT }; /*!\brief vpx 1-D scaling mode @@ -277,6 +278,12 @@ typedef enum { VP8_EIGHT_TOKENPARTITION = 3 } vp8e_token_partitions; +/*!brief VP9 encoder content type */ +typedef enum { + VP9E_CONTENT_DEFAULT, + VP9E_CONTENT_SCREEN, + VP9E_CONTENT_INVALID +} vp9e_tune_content; /*!\brief VP8 model tuning parameters * @@ -370,6 +377,7 @@ VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int) +VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index 45e702354..07df72a78 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -153,7 +153,6 @@ extern "C" { typedef long vpx_codec_caps_t; #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ -#define VPX_CODEC_CAP_XMA 0x4 /**< Supports eXternal Memory Allocation */ /*! \brief Initialization-time Feature Enabling @@ -164,7 +163,6 @@ extern "C" { * The available flags are specified by VPX_CODEC_USE_* defines. */ typedef long vpx_codec_flags_t; -#define VPX_CODEC_USE_XMA 0x00000001 /**< Use eXternal Memory Allocation mode */ /*!\brief Codec interface structure. @@ -471,94 +469,6 @@ extern "C" { #endif - - /*!\defgroup cap_xma External Memory Allocation Functions - * - * The following functions are required to be implemented for all codecs - * that advertise the VPX_CODEC_CAP_XMA capability. Calling these functions - * for codecs that don't advertise this capability will result in an error - * code being returned, usually VPX_CODEC_INCAPABLE - * @{ - */ - - - /*!\brief Memory Map Entry - * - * This structure is used to contain the properties of a memory segment. It - * is populated by the codec in the request phase, and by the calling - * application once the requested allocation has been performed. 
- */ - typedef struct vpx_codec_mmap { - /* - * The following members are set by the codec when requesting a segment - */ - unsigned int id; /**< identifier for the segment's contents */ - unsigned long sz; /**< size of the segment, in bytes */ - unsigned int align; /**< required alignment of the segment, in bytes */ - unsigned int flags; /**< bitfield containing segment properties */ -#define VPX_CODEC_MEM_ZERO 0x1 /**< Segment must be zeroed by allocation */ -#define VPX_CODEC_MEM_WRONLY 0x2 /**< Segment need not be readable */ -#define VPX_CODEC_MEM_FAST 0x4 /**< Place in fast memory, if available */ - - /* The following members are to be filled in by the allocation function */ - void *base; /**< pointer to the allocated segment */ - void (*dtor)(struct vpx_codec_mmap *map); /**< destructor to call */ - void *priv; /**< allocator private storage */ - } vpx_codec_mmap_t; /**< alias for struct vpx_codec_mmap */ - - - /*!\brief Iterate over the list of segments to allocate. - * - * Iterates over a list of the segments to allocate. The iterator storage - * should be initialized to NULL to start the iteration. Iteration is complete - * when this function returns VPX_CODEC_LIST_END. The amount of memory needed to - * allocate is dependent upon the size of the encoded stream. In cases where the - * stream is not available at allocation time, a fixed size must be requested. - * The codec will not be able to operate on streams larger than the size used at - * allocation time. - * - * \param[in] ctx Pointer to this instance's context. - * \param[out] mmap Pointer to the memory map entry to populate. - * \param[in,out] iter Iterator storage, initialized to NULL - * - * \retval #VPX_CODEC_OK - * The memory map entry was populated. - * \retval #VPX_CODEC_ERROR - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Unable to determine segment size from stream info. - */ - vpx_codec_err_t vpx_codec_get_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmap, - vpx_codec_iter_t *iter); - - - /*!\brief Identify allocated segments to codec instance - * - * Stores a list of allocated segments in the codec. Segments \ref MUST be - * passed in the order they are read from vpx_codec_get_mem_map(), but may be - * passed in groups of any size. Segments \ref MUST be set only once. The - * allocation function \ref MUST ensure that the vpx_codec_mmap_t::base member - * is non-NULL. If the segment requires cleanup handling (e.g., calling free() - * or close()) then the vpx_codec_mmap_t::dtor member \ref MUST be populated. - * - * \param[in] ctx Pointer to this instance's context. - * \param[in] mmaps Pointer to the first memory map entry in the list. - * \param[in] num_maps Number of entries being set at this time - * - * \retval #VPX_CODEC_OK - * The segment was stored in the codec context. - * \retval #VPX_CODEC_INCAPABLE - * Codec does not support XMA mode. - * \retval #VPX_CODEC_MEM_ERROR - * Segment base address was not set, or segment was already stored. - - */ - vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx, - vpx_codec_mmap_t *mmaps, - unsigned int num_maps); - - /*!@} - end defgroup cap_xma*/ /*!@} - end defgroup codec*/ #ifdef __cplusplus } diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index ba183283e..10b89fa0f 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -122,10 +122,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. 
* - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 123502b71..eec7f30f4 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -160,7 +160,7 @@ extern "C" { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ #endif VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ @@ -199,7 +199,7 @@ extern "C" { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ struct vpx_fixed_buf raw; /**< data for arbitrary packets */ -#ifdef CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC size_t layer_sizes[VPX_SS_MAX_LAYERS]; #endif @@ -656,14 +656,12 @@ extern "C" { */ unsigned int ss_number_layers; -#ifdef CONFIG_SPATIAL_SVC /*!\brief Enable auto alt reference flags for each spatial layer. * * These values specify if auto alt reference frame is enabled for each * spatial layer. */ int ss_enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; -#endif /*!\brief Target bitrate for each spatial layer. * @@ -723,10 +721,6 @@ extern "C" { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. @@ -760,10 +754,6 @@ extern "C" { * instead of this function directly, to ensure that the ABI version number * parameter is properly initialized. * - * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags - * parameter), the storage pointed to by the cfg parameter must be - * kept readable and stable until all memory maps have been set. - * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h index 258618bbd..ffeefb819 100644 --- a/vpx/vpx_integer.h +++ b/vpx/vpx_integer.h @@ -15,6 +15,15 @@ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include <stddef.h> +#if defined(_MSC_VER) +#define VPX_FORCE_INLINE __forceinline +#define VPX_INLINE __inline +#else +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) +// TODO(jbb): Allow a way to force inline off for older compilers.
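The new vpx_integer.h macros give the tree one portable spelling for inline functions in headers: MSVC builds see __inline/__forceinline, everything else sees C99 inline (defined in the #else branch that follows) plus GCC's always_inline attribute. A minimal sketch of the intended use, with a hypothetical helper name; real call sites follow in the mem_ops changes below:

    #include "vpx/vpx_integer.h"

    /* Header-safe helper: VPX_INLINE expands to '__inline' under MSVC
     * and to plain 'inline' elsewhere, so no per-compiler #ifdefs are
     * needed at the definition site. */
    static VPX_INLINE int clamp_byte(int v) {
      return v < 0 ? 0 : (v > 255 ? 255 : v);
    }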
+#define VPX_INLINE inline +#endif + #if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES) typedef signed char int8_t; typedef signed short int16_t; diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h index 8c8b52618..d4a3d773f 100644 --- a/vpx_ports/mem_ops.h +++ b/vpx_ports/mem_ops.h @@ -133,7 +133,7 @@ static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { } #define mem_get_s_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) {\ const MAU_T *mem = (const MAU_T*)vmem;\ signed MEM_VALUE_T val = mem_get_##end##sz(mem);\ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz);\ @@ -165,7 +165,7 @@ mem_get_s_generic(le, 32) #undef mem_put_be16 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) -static void mem_put_be16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 8) & 0xff; @@ -174,7 +174,7 @@ static void mem_put_be16(void *vmem, MEM_VALUE_T val) { #undef mem_put_be24 #define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) -static void mem_put_be24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 16) & 0xff; @@ -184,7 +184,7 @@ static void mem_put_be24(void *vmem, MEM_VALUE_T val) { #undef mem_put_be32 #define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) -static void mem_put_be32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 24) & 0xff; @@ -195,7 +195,7 @@ static void mem_put_be32(void *vmem, MEM_VALUE_T val) { #undef mem_put_le16 #define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) -static void mem_put_le16(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -204,7 +204,7 @@ static void mem_put_le16(void *vmem, MEM_VALUE_T val) { #undef mem_put_le24 #define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) -static void mem_put_le24(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; @@ -214,7 +214,7 @@ static void mem_put_le24(void *vmem, MEM_VALUE_T val) { #undef mem_put_le32 #define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) -static void mem_put_le32(void *vmem, MEM_VALUE_T val) { +static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { MAU_T *mem = (MAU_T *)vmem; mem[0] = (val >> 0) & 0xff; diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h index 24743c8d6..c16111fec 100644 --- a/vpx_ports/mem_ops_aligned.h +++ b/vpx_ports/mem_ops_aligned.h @@ -44,19 +44,22 @@ #define swap_endian_32_se(val,raw) swap_endian_32(val,raw) #define mem_get_ne_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ return *mem;\ } #define mem_get_sne_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ return *mem;\ } #define 
mem_get_se_aligned_generic(end,sz) \ - static unsigned MEM_VALUE_T mem_get_##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE unsigned MEM_VALUE_T \ + mem_get_##end##sz##_aligned(const void *vmem) {\ const uint##sz##_t *mem = (const uint##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz(val,raw);\ @@ -64,7 +67,8 @@ } #define mem_get_sse_aligned_generic(end,sz) \ - static signed MEM_VALUE_T mem_get_s##end##sz##_aligned(const void *vmem) {\ + static VPX_INLINE signed MEM_VALUE_T \ + mem_get_s##end##sz##_aligned(const void *vmem) {\ const int##sz##_t *mem = (const int##sz##_t *)vmem;\ unsigned MEM_VALUE_T val, raw = *mem;\ swap_endian_##sz##_se(val,raw);\ @@ -72,13 +76,15 @@ } #define mem_put_ne_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem;\ *mem = (uint##sz##_t)val;\ } #define mem_put_se_aligned_generic(end,sz) \ - static void mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ + static VPX_INLINE void \ + mem_put_##end##sz##_aligned(void *vmem, MEM_VALUE_T val) {\ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw;\ swap_endian_##sz(raw,val);\ *mem = (uint##sz##_t)raw;\ diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h index 182892acf..8772c6eea 100644 --- a/vpx_ports/vpx_once.h +++ b/vpx_ports/vpx_once.h @@ -73,6 +73,31 @@ static void once(void (*func)(void)) } +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include <os2.h> +static void once(void (*func)(void)) { + static int done; + + /* If the initialization is complete, return early. */ + if (done) + return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + + #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) diff --git a/vpx_ports/vpx_timer.h b/vpx_ports/vpx_timer.h index 9e2015e62..870338b4f 100644 --- a/vpx_ports/vpx_timer.h +++ b/vpx_ports/vpx_timer.h @@ -53,7 +53,7 @@ struct vpx_usec_timer { }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->begin); @@ -63,7 +63,7 @@ vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { #if defined(_WIN32) QueryPerformanceCounter(&t->end); @@ -73,7 +73,7 @@ vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static int64_t +static INLINE int64_t vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { #if defined(_WIN32) LARGE_INTEGER freq, diff; @@ -101,13 +101,13 @@ struct vpx_usec_timer { void *dummy; }; -static void +static INLINE void vpx_usec_timer_start(struct vpx_usec_timer *t) { } -static void +static INLINE void vpx_usec_timer_mark(struct vpx_usec_timer *t) { } -static long +static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index bc99f89d8..81c2b8b87 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -116,7 +116,7 @@ void __cpuid(int CPUInfo[4], int info_type); #define BIT(n) (1<oxcf.Width and -;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4).
- -;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_src_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - ;Copy Y plane - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r12, r5 - -cp_width_128_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q14, q15}, [r10]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q10, q11}, [r3]! - vst1.8 {q14, q15}, [r11]! - bhs cp_width_128_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! - bhs cp_width_8_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_1_loop - -cp_width_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_loop - -;copy last line for Y if y_height is odd - tst r4, #1 - beq cp_width_done_1 - mov r12, r5 - -cp_width_128_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q10, q11}, [r2]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q10, q11}, [r3]! - bhs cp_width_128_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_8_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_1_loop_1 -cp_width_done_1 - -;Copy U & V planes - ldr r4, [r0, #yv12_buffer_config_uv_height] - ldr r5, [r0, #yv12_buffer_config_uv_width] - ldr r6, [r0, #yv12_buffer_config_uv_stride] - ldr r7, [r1, #yv12_buffer_config_uv_stride] - ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - mov r9, #2 - -cp_uv_loop - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r12, r5 - -cp_width_uv_64_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - bhs cp_width_uv_64_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! 
- bhs cp_width_uv_8_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_uv_1_loop - -cp_width_uv_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_uv_loop - -;copy last line for U & V if uv_height is odd - tst r4, #1 - beq cp_width_uv_done_1 - mov r12, r5 - -cp_width_uv_64_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - bhs cp_width_uv_64_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_uv_8_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_uv_1_loop_1 -cp_width_uv_done_1 - - subs r9, r9, #1 - ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1 - ldrne r10, [r0, #yv12_buffer_config_uv_stride] - ldrne r11, [r1, #yv12_buffer_config_uv_stride] - - addne r10, r2, r10 ;second row src - addne r11, r3, r11 ;second row dst - - bne cp_uv_loop - - vpop {d8 - d15} - pop {r4 - r11, pc} - - ENDP - END diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm deleted file mode 100644 index b2eb9eb0f..000000000 --- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm +++ /dev/null @@ -1,308 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_yv12_extend_frame_borders_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf); -; we depend on VP8BORDERINPIXELS being 32 - -|vp8_yv12_extend_frame_borders_neon| PROC - push {r4 - r10, lr} - vpush {d8 - d15} - - ; Border = 32 - ldr r3, [r0, #yv12_buffer_config_y_width] ; plane_width - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; src_ptr1 - ldr r4, [r0, #yv12_buffer_config_y_height] ; plane_height - ldr lr, [r0, #yv12_buffer_config_y_stride] ; plane_stride - -; Border copy for Y plane -; copy the left and right most columns out - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - sub r5, r1, #32 ; dest_ptr1 = src_ptr1 - Border - - mov r12, r4, lsr #2 ; plane_height / 4 - -copy_left_right_y - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d4[], d5[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d12[], d13[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d20[], d21[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d28[], d29[]}, [r2], lr - - vmov q1, q0 - vmov q3, q2 - vmov q5, q4 - vmov q7, q6 - vmov q9, q8 - vmov q11, q10 - vmov q13, q12 - vmov q15, q14 - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r5], lr - vst1.8 {q2, q3}, [r6], lr - vst1.8 {q4, q5}, [r5], lr - vst1.8 {q6, q7}, [r6], lr - vst1.8 {q8, q9}, [r5], lr - vst1.8 {q10, q11}, [r6], lr - vst1.8 {q12, q13}, [r5], lr - vst1.8 {q14, q15}, [r6], lr - - bne copy_left_right_y - -;Now copy the top and bottom source lines into each line of the respective borders - ldr r1, [r0, #yv12_buffer_config_y_buffer] ; y_buffer - mul r8, r4, lr ; plane_height * plane_stride - - ; copy width is plane_stride - movs r12, lr, lsr #7 ; plane_stride / 128 - - sub r1, r1, #32 ; src_ptr1 = y_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 - plane_stride (src_ptr1 + (plane_height * plane_stride)) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #5 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_y_copy_needed ; plane stride < 128 - -copy_top_bottom_y - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q4, q5}, [r1]! - vld1.8 {q12, q13}, [r2]! - vld1.8 {q6, q7}, [r1]! - vld1.8 {q14, q15}, [r2]! - - mov r7, #32 ; Border - -top_bottom_32 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! - vst1.8 {q4, q5}, [r5]! - vst1.8 {q12, q13}, [r6]! - vst1.8 {q6, q7}, [r5]! - vst1.8 {q14, q15}, [r6]! 
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #128 ; dest_ptr1 -= 128 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #128 ; dest_ptr2 -= 128 - - bne top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border* plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_y - -extra_y_copy_needed - mov r7, lr, lsr #4 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_y -end_of_border_copy_y - -;Border copy for U, V planes -; Border = 16 - ldr r7, [r0, #yv12_buffer_config_u_buffer] ; src_ptr1 - ldr lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride - ldr r3, [r0, #yv12_buffer_config_uv_width] ; plane_width - ldr r4, [r0, #yv12_buffer_config_uv_height] ; plane_height - - mov r10, #2 - -;copy the left and right most columns out -border_copy_uv - mov r1, r7 ; src_ptr1 needs to be saved for second half of loop - sub r5, r1, #16 ; dest_ptr1 = src_ptr1 - Border - add r6, r1, r3 ; dest_ptr2 = src_ptr2 + 1 (src_ptr1 + plane_width) - sub r2, r6, #1 ; src_ptr2 = src_ptr1 + plane_width - 1 - - mov r12, r4, lsr #3 ; plane_height / 8 - -copy_left_right_uv - vld1.8 {d0[], d1[]}, [r1], lr - vld1.8 {d2[], d3[]}, [r2], lr - vld1.8 {d4[], d5[]}, [r1], lr - vld1.8 {d6[], d7[]}, [r2], lr - vld1.8 {d8[], d9[]}, [r1], lr - vld1.8 {d10[], d11[]}, [r2], lr - vld1.8 {d12[], d13[]}, [r1], lr - vld1.8 {d14[], d15[]}, [r2], lr - vld1.8 {d16[], d17[]}, [r1], lr - vld1.8 {d18[], d19[]}, [r2], lr - vld1.8 {d20[], d21[]}, [r1], lr - vld1.8 {d22[], d23[]}, [r2], lr - vld1.8 {d24[], d25[]}, [r1], lr - vld1.8 {d26[], d27[]}, [r2], lr - vld1.8 {d28[], d29[]}, [r1], lr - vld1.8 {d30[], d31[]}, [r2], lr - - subs r12, r12, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q1}, [r6], lr - vst1.8 {q2}, [r5], lr - vst1.8 {q3}, [r6], lr - vst1.8 {q4}, [r5], lr - vst1.8 {q5}, [r6], lr - vst1.8 {q6}, [r5], lr - vst1.8 {q7}, [r6], lr - vst1.8 {q8}, [r5], lr - vst1.8 {q9}, [r6], lr - vst1.8 {q10}, [r5], lr - vst1.8 {q11}, [r6], lr - vst1.8 {q12}, [r5], lr - vst1.8 {q13}, [r6], lr - vst1.8 {q14}, [r5], lr - vst1.8 {q15}, [r6], lr - - bne copy_left_right_uv - -;Now copy the top and bottom source lines into each line of the respective borders - mov r1, r7 - mul r8, r4, lr ; plane_height * plane_stride - movs r12, lr, lsr #6 ; plane_stride / 64 - - sub r1, r1, #16 ; src_ptr1 = u_buffer - Border - add r6, r1, r8 ; dest_ptr2 = src_ptr2 + plane_stride (src_ptr1 + (plane_height * plane_stride) - sub r2, r6, lr ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - ble extra_uv_copy_needed ; plane_stride < 64 - -copy_top_bottom_uv - vld1.8 {q0, q1}, [r1]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q2, q3}, [r1]! - vld1.8 {q10, q11}, [r2]! - - mov r7, #16 ; Border - -top_bottom_16 - subs r7, r7, #1 - - vst1.8 {q0, q1}, [r5]! - vst1.8 {q8, q9}, [r6]! - vst1.8 {q2, q3}, [r5]! - vst1.8 {q10, q11}, [r6]! 
- - add r5, r5, lr ; dest_ptr1 += plane_stride - sub r5, r5, #64 - add r6, r6, lr ; dest_ptr2 += plane_stride - sub r6, r6, #64 - - bne top_bottom_16 - - sub r5, r1, lr, asl #4 ; dest_ptr1 = src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; dest_ptr2 = src_ptr2 + plane_stride - - subs r12, r12, #1 - bne copy_top_bottom_uv -extra_uv_copy_needed - mov r7, lr, lsr #3 ; check to see if extra copy is needed - ands r7, r7, #0x7 - bne extra_top_bottom_uv - -end_of_border_copy_uv - subs r10, r10, #1 - ldrne r7, [r0, #yv12_buffer_config_v_buffer] ; src_ptr1 - bne border_copy_uv - - vpop {d8 - d15} - pop {r4 - r10, pc} - -;;;;;;;;;;;;;;;;;;;;;; -extra_top_bottom_y - vld1.8 {q0}, [r1]! - vld1.8 {q2}, [r2]! - - mov r9, #4 ; 32 >> 3 - -extra_top_bottom_32 - subs r9, r9, #1 - - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - vst1.8 {q0}, [r5], lr - vst1.8 {q2}, [r6], lr - bne extra_top_bottom_32 - - sub r5, r1, lr, asl #5 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_y - - b end_of_border_copy_y - -extra_top_bottom_uv - vld1.8 {d0}, [r1]! - vld1.8 {d8}, [r2]! - - mov r9, #2 ; 16 >> 3 - -extra_top_bottom_16 - subs r9, r9, #1 - - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - vst1.8 {d0}, [r5], lr - vst1.8 {d8}, [r6], lr - bne extra_top_bottom_16 - - sub r5, r1, lr, asl #4 ; src_ptr1 - (Border * plane_stride) - add r6, r2, lr ; src_ptr2 + plane_stride - subs r7, r7, #1 - bne extra_top_bottom_uv - - b end_of_border_copy_uv - - ENDP - END diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c deleted file mode 100644 index fac7bbc1b..000000000 --- a/vpx_scale/arm/neon/yv12extend_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_scale_rtcd.h" - -extern void vp8_yv12_copy_frame_func_neon( - const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); - -void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc) { - vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_neon(dst_ybc); -} diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index 92f8b85ad..de6ba1f44 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -84,6 +84,8 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->y_height = aligned_height; ybf->y_stride = y_stride; + ybf->uv_crop_width = (width + 1) / 2; + ybf->uv_crop_height = (height + 1) / 2; ybf->uv_width = uv_width; ybf->uv_height = uv_height; ybf->uv_stride = uv_stride; @@ -216,7 +218,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, if (!ybf->buffer_alloc) return -1; - ybf->buffer_alloc_sz = frame_size; + ybf->buffer_alloc_sz = (int)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be @@ -245,7 +247,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = frame_size; + ybf->frame_size = (int)frame_size; #if CONFIG_VP9_HIGH if (use_high) { // Store uint16 addresses when using 16bit framebuffers diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c index 1f8b5b7dc..76515fd12 100644 --- a/vpx_scale/generic/yv12extend.c +++ b/vpx_scale/generic/yv12extend.c @@ -103,6 +103,9 @@ static void extend_plane_high(uint8_t *const src8, int src_stride, #endif void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { + const int uv_border = ybf->border / 2; + + assert(ybf->border % 2 == 0); assert(ybf->y_height - ybf->y_crop_height < 16); assert(ybf->y_width - ybf->y_crop_width < 16); assert(ybf->y_height - ybf->y_crop_height >= 0); @@ -137,16 +140,16 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { ybf->border + ybf->y_width - ybf->y_crop_width); extend_plane(ybf->u_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); extend_plane(ybf->v_buffer, ybf->uv_stride, - (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2, - ybf->border / 2, ybf->border / 2, - (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2, - (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2); + ybf->uv_crop_width, ybf->uv_crop_height, + uv_border, uv_border, + uv_border + ybf->uv_height - ybf->uv_crop_height, + uv_border + ybf->uv_width - ybf->uv_crop_width); } #if CONFIG_VP9 diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk index 95e7483e6..0a1594bd8 100644 --- a/vpx_scale/vpx_scale.mk +++ b/vpx_scale/vpx_scale.mk @@ -9,12 +9,6 @@ SCALE_SRCS-yes += vpx_scale_asm_offsets.c SCALE_SRCS-yes += vpx_scale_rtcd.c SCALE_SRCS-yes += vpx_scale_rtcd.pl -#neon -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += 
arm/neon/vp8_vpxyv12_extendframeborders_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/yv12extend_arm.c - #mips(dspr2) SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl index 2e3f1ffbe..d4a2b81a5 100644 --- a/vpx_scale/vpx_scale_rtcd.pl +++ b/vpx_scale/vpx_scale_rtcd.pl @@ -17,12 +17,8 @@ if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") { } add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf"; -specialize qw/vp8_yv12_extend_frame_borders neon_asm/; -$vp8_yv12_extend_frame_borders_neon_asm=vp8_yv12_extend_frame_borders_neon; add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; -specialize qw/vp8_yv12_copy_frame neon_asm/; -$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon; add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; diff --git a/vpxdec.c b/vpxdec.c index 6a4685045..231b3468d 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -131,8 +131,8 @@ static const arg_def_t *vp8_pp_args[] = { }; #endif -static int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, - FilterModeEnum mode) { +static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, + FilterModeEnum mode) { #if CONFIG_VP9_HIGH if (src->fmt == VPX_IMG_FMT_I42016) { assert(dst->fmt == VPX_IMG_FMT_I42016); @@ -458,6 +458,7 @@ void generate_filename(const char *pattern, char *out, size_t q_len, break; default: die("Unrecognized pattern %%%c\n", p[1]); + break; } pat_len = strlen(q); @@ -716,7 +717,7 @@ int main_loop(int argc, const char **argv_) { int use_y4m = 1; int opt_yv12 = 0; int opt_i420 = 0; - vpx_codec_dec_cfg_t cfg = {0}; + vpx_codec_dec_cfg_t cfg = {0, 0, 0}; #if CONFIG_VP9_HIGH int out_bit_depth = 0; #endif @@ -736,7 +737,7 @@ int main_loop(int argc, const char **argv_) { #endif int frame_avail, got_data; int num_external_frame_buffers = 0; - struct ExternalFrameBufferList ext_fb_list = {0}; + struct ExternalFrameBufferList ext_fb_list = {0, NULL}; const char *outfile_pattern = NULL; char outfile_name[PATH_MAX] = {0}; @@ -745,8 +746,8 @@ int main_loop(int argc, const char **argv_) { MD5Context md5_ctx; unsigned char md5_digest[16]; - struct VpxDecInputContext input = {0}; - struct VpxInputContext vpx_input_ctx = {0}; + struct VpxDecInputContext input = {NULL, NULL}; + struct VpxInputContext vpx_input_ctx; #if CONFIG_WEBM_IO struct WebmInputContext webm_ctx = {0}; input.webm_ctx = &webm_ctx; diff --git a/vpxenc.c b/vpxenc.c index 2d1e41d61..675874e6a 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -408,11 +408,20 @@ static const arg_def_t inbitdeptharg = ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); #endif +static const struct arg_enum_list tune_content_enum[] = { + {"default", VP9E_CONTENT_DEFAULT}, + {"screen", VP9E_CONTENT_SCREEN}, + {NULL, 0} +}; + +static const arg_def_t tune_content = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + static const arg_def_t *vp9_args[] = { &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, - &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, + &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, &tune_content, #if CONFIG_VP9_HIGH &bitdeptharg, &inbitdeptharg, #endif @@ -425,7 +434,7 @@ static const int vp9_arg_ctrl_map[] = { 
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, - VP9E_SET_FRAME_PERIODIC_BOOST, + VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_TUNE_CONTENT, 0 }; #endif diff --git a/y4menc.c b/y4menc.c index 9211452a4..b647e8dcc 100644 --- a/y4menc.c +++ b/y4menc.c @@ -48,6 +48,7 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height, "C420p16 XYSCSS=420P16\n"; break; default: + color = NULL; assert(0); } return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height,
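The vp8cx.h and vpxenc.c pieces above wire the new content-type tuning end to end: the VP9E_SET_TUNE_CONTENT control ID and the vp9e_tune_content enum define the API, VPX_CTRL_USE_TYPE declares the control's argument as int, and vp9_arg_ctrl_map routes the new --tune-content argument to it. A hedged sketch of the programmatic path (encoder initialization elided, error handling omitted, and the function name hypothetical):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* 'encoder' is assumed to be an initialized VP9 encoder context.
     * This reaches the same control the command line drives via
     * 'vpxenc --tune-content=screen'. */
    static void tune_for_screen_capture(vpx_codec_ctx_t *encoder) {
      vpx_codec_control(encoder, VP9E_SET_TUNE_CONTENT,
                        (int)VP9E_CONTENT_SCREEN);
    }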