update libyuv to r1456

picks up build warning fixes for visual studio 2015

Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520
James Zern
2015-07-24 16:54:51 -07:00
parent f42012e526
commit fcb4253c9c
46 changed files with 5400 additions and 2955 deletions


@@ -22,17 +22,18 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
 third_party/libyuv/source/planar_functions.cc \
 third_party/libyuv/source/row_any.cc \
 third_party/libyuv/source/row_common.cc \
+third_party/libyuv/source/row_gcc.cc \
 third_party/libyuv/source/row_mips.cc \
 third_party/libyuv/source/row_neon.cc \
 third_party/libyuv/source/row_neon64.cc \
-third_party/libyuv/source/row_posix.cc \
 third_party/libyuv/source/row_win.cc \
 third_party/libyuv/source/scale.cc \
+third_party/libyuv/source/scale_any.cc \
 third_party/libyuv/source/scale_common.cc \
+third_party/libyuv/source/scale_gcc.cc \
 third_party/libyuv/source/scale_mips.cc \
 third_party/libyuv/source/scale_neon.cc \
 third_party/libyuv/source/scale_neon64.cc \
-third_party/libyuv/source/scale_posix.cc \
 third_party/libyuv/source/scale_win.cc \
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1305
+Version: 1456
 License: BSD
 License File: LICENSE
@@ -13,4 +13,3 @@ which down-samples the original input video (f.g. 1280x720) a number of times
 in order to encode multiple resolution bit streams.
 Local Modifications:
-cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'


@@ -71,6 +71,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
+#define J400ToJ420 I400ToI420
 // Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,


@@ -68,20 +68,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
-// Convert I400 (grey) to ARGB.
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
-// Alias.
-#define YToARGB I400ToARGB_Reference
-// Convert I400 to ARGB. Reverse of ARGBToI400.
+// Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+int J400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
+// Alias.
+#define YToARGB I400ToARGB
 // Convert NV12 to ARGB.
 LIBYUV_API
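
For reference, a minimal usage sketch of the renamed grey-to-ARGB entry point (buffer names and dimensions below are hypothetical, not part of this change):

  #include "libyuv.h"  // umbrella header; J400ToARGB is declared in convert_argb.h
  // Expand an 8-bit grey (J400) plane to ARGB. J400ToARGB replaces the old
  // I400ToARGB_Reference name; YToARGB now aliases I400ToARGB instead.
  uint8 gray[64 * 48];          // 64x48 grey plane (hypothetical size)
  uint8 argb[64 * 48 * 4];      // 4 bytes per ARGB pixel
  libyuv::J400ToARGB(gray, 64, argb, 64 * 4, 64, 48);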


@@ -137,6 +137,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
                  uint8* dst_frame, int dst_stride_frame,
                  int width, int height);
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
 LIBYUV_API
 int I420ToARGB1555(const uint8* src_y, int src_stride_y,
                    const uint8* src_u, int src_stride_u,
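
A minimal sketch of calling the new dithered conversion declared above; per the comment, the 16 dither values should be in the 0..7 range and the first byte applies to the upper-left pixel (plane pointers, strides and sizes here are hypothetical):

  #include "libyuv.h"
  static const uint8 kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
  };
  // src_y/src_u/src_v describe an I420 frame; dst_rgb565 holds width * 2 bytes per row.
  libyuv::I420ToRGB565Dither(src_y, y_stride, src_u, uv_stride, src_v, uv_stride,
                             dst_rgb565, width * 2, kDither4x4, width, height);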


@@ -61,12 +61,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);
-// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
-// Values in dither matrix from 0 to 255. 128 is best for no dither.
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
 LIBYUV_API
 int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither8x8, int width, int height);
+                       const uint8* dither4x4, int width, int height);
 // Convert ARGB To ARGB1555.
 LIBYUV_API
@@ -140,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
 // Convert ARGB To NV12.
 LIBYUV_API
 int ARGBToNV12(const uint8* src_argb, int src_stride_argb,


@@ -45,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
+#define J400ToJ400 I400ToI400
 // Copy I422 to I422.
 #define I422ToI422 I422Copy
@@ -84,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -93,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
                int width, int height);
 // Alias
+#define J420ToJ400 I420ToI400
 #define I420ToI420Mirror I420Mirror
 // I420 mirror.
@@ -387,24 +401,24 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation);
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
-// Row functions for copying a pixels from a source with a slope to a row
+// Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif  // LIBYUV_DISABLE_X86
 // Shuffle ARGB channel order. e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
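
As a usage sketch for the YUY2ToNV12/UYVYToNV12 helpers added above (buffer names are hypothetical; YUY2 packs 2 bytes per pixel, NV12 stores a full-size Y plane plus a half-height interleaved UV plane):

  #include "libyuv.h"
  libyuv::YUY2ToNV12(src_yuy2, width * 2,   // packed 4:2:2 input, 2 bytes per pixel
                     dst_y, width,          // Y plane, width bytes per row
                     dst_uv, width,         // UV plane, width bytes per row, height/2 rows
                     width, height);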


@@ -0,0 +1,138 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
// The following are available for 32 bit GCC:
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
#define HAS_TRANSPOSEWX8_FAST_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_MIPS_DSPR2
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height);
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT


@@ -37,10 +37,8 @@ extern "C" {
     free(var##_mem);  \
     var = 0
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR) || \
-    (defined(__i386__) && !defined(__SSE2__)) || \
-    (defined(_MSC_VER) && defined(__clang__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -48,6 +46,9 @@ extern "C" {
 #define LIBYUV_SSSE3_ONLY
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -63,11 +64,11 @@ extern "C" {
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
 #define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
-#define HAS_ARGBTOBAYERGGROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
@@ -95,7 +96,8 @@ extern "C" {
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
-// #define HAS_J422TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
@@ -112,15 +114,13 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
-#define HAS_SETROW_X86
 #define HAS_SETROW_ERMS
-#define HAS_ARGBSETROW_X86
+#define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
 #define HAS_UYVYTOARGBROW_SSSE3
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
-#define HAS_YTOARGBROW_SSE2
 #define HAS_YUY2TOARGBROW_SSSE3
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
@@ -157,8 +157,9 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 #endif
-// The following are available on x64 Visual C:
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
 #define HAS_I422TOARGBROW_SSSE3
 #endif
@@ -177,27 +178,31 @@ extern "C" {
 #endif  // __clang__
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 // The following are available require VS2012. Port to GCC.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
-#define HAS_I422TOABGRROW_AVX2
-#define HAS_I422TOARGBROW_AVX2
-#define HAS_I422TOBGRAROW_AVX2
-#define HAS_I422TORGBAROW_AVX2
-#define HAS_NV12TOARGBROW_AVX2
-#define HAS_NV21TOARGBROW_AVX2
-#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
 #define HAS_ARGBTOARGB1555ROW_AVX2
 #define HAS_ARGBTOARGB4444ROW_AVX2
-#define HAS_NV12TORGB565ROW_AVX2
-#define HAS_NV21TORGB565ROW_AVX2
-#define HAS_I422TORGB565ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
 #endif
 // The following are available on all x86 platforms, but
@@ -214,24 +219,27 @@ extern "C" {
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_COPYROW_AVX
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
 #define HAS_UYVYTOUVROW_AVX2
 #define HAS_UYVYTOYROW_AVX2
-#define HAS_YTOARGBROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
-// The following require HAS_I422TOARGBROW_AVX2
-#if defined(HAS_I422TOARGBROW_AVX2)
-#define HAS_YUY2TOARGBROW_AVX2
-#define HAS_UYVYTOARGBROW_AVX2
-#endif
 // Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
@@ -240,22 +248,6 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif
-// The following are Yasm x86 only:
-// TODO(fbarchard): Port AVX2 to inline.
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
-    (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__x86_64__) || defined(__i386__))
-#define HAS_MERGEUVROW_AVX2
-#define HAS_MERGEUVROW_MMX
-#define HAS_SPLITUVROW_AVX2
-#define HAS_SPLITUVROW_MMX
-#define HAS_UYVYTOYROW_AVX2
-#define HAS_UYVYTOYROW_MMX
-#define HAS_YUY2TOYROW_AVX2
-#define HAS_YUY2TOYROW_MMX
-#endif
 // The following are disabled when SSSE3 is available:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
@@ -278,7 +270,6 @@ extern "C" {
 #define HAS_ARGB4444TOYROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOBAYERGGROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
@@ -292,7 +283,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_I400TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
 #define HAS_I411TOARGBROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -331,11 +322,12 @@ extern "C" {
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
-#define HAS_YTOARGBROW_NEON
+#define HAS_I400TOARGBROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
 #define HAS_YUY2TOUV422ROW_NEON
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
 // Effects:
 #define HAS_ARGBADDROW_NEON
@@ -388,7 +380,6 @@ typedef __declspec(align(32)) int8 lvec8[32];
 typedef __declspec(align(32)) uint16 ulvec16[16];
 typedef __declspec(align(32)) uint32 ulvec32[8];
 typedef __declspec(align(32)) uint8 ulvec8[32];
 #elif defined(__GNUC__)
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
@@ -869,6 +860,11 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix);
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
@@ -884,12 +880,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                               int pix);
 void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                                 int pix);
 void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                                 int pix);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
 void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
@@ -905,6 +909,13 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -914,6 +925,8 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
 void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -922,14 +935,13 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint8* dither8x8, int pix);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I444ToARGBRow_C(const uint8* src_y,
                      const uint8* src_u,
@@ -1038,6 +1050,11 @@ void I444ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1048,6 +1065,11 @@ void I411ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void NV12ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_argb,
@@ -1097,6 +1119,11 @@ void J422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToBGRARow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1147,11 +1174,21 @@ void I422ToRGB24Row_SSSE3(const uint8* src_y,
                           const uint8* src_v,
                           uint8* dst_rgb24,
                           int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
 void I422ToRAWRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_raw,
                         int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
 void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
@@ -1177,6 +1214,11 @@ void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1187,6 +1229,11 @@ void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_uv,
                              uint8* dst_argb,
@@ -1231,6 +1278,16 @@ void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
 void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
                             uint8* dst_argb,
                             int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1281,33 +1338,29 @@ void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_v,
                               uint8* dst_argb,
                               int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
 void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
-void YToARGBRow_C(const uint8* src_y,
-                  uint8* dst_argb,
-                  int width);
-void YToARGBRow_SSE2(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_AVX2(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_NEON(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_Any_SSE2(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
-void YToARGBRow_Any_AVX2(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
-void YToARGBRow_Any_NEON(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
@@ -1375,6 +1428,11 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
 void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1384,6 +1442,8 @@ void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
 void I444ToARGBRow_Any_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -1570,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int pix);
-void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
-                        uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
@@ -1770,6 +1819,18 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
 void ARGBPolynomialRow_C(const uint8* src_argb,
                          uint8* dst_argb, const float* poly,


@@ -12,45 +12,66 @@
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 #include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SCALEROWDOWN2_SSE2
-#define HAS_SCALEROWDOWN4_SSE2
-#define HAS_SCALEROWDOWN34_SSSE3
-#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEADDROWS_SSE2
-#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALECOLSUP2_SSE2
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
 #define HAS_SCALEARGBROWDOWN2_SSE2
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
-#define HAS_SCALEARGBCOLS_SSE2
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-#define HAS_SCALEARGBCOLSUP2_SSE2
-#define HAS_FIXEDDIV_X86
-#define HAS_FIXEDDIV1_X86
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSE2
+#endif
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
 #endif
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
 #define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 // The following are available on Mips platforms:
@@ -164,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width);
@@ -194,16 +213,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
 void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);
-// Specialized scalers for x86.
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -220,46 +251,124 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx);
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx);
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
+// ARGB Column functions
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx);
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx);
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx);
-// Row functions.
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
@@ -267,7 +376,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);
@@ -302,6 +412,42 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1305
+#define LIBYUV_VERSION 1456
 #endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@@ -37,7 +37,7 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
#define HAS_HASHDJB2_SSE41 #define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700 #ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2 #define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif #endif
@@ -138,8 +138,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_SSE2 #define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif #endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 #ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2 #define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif #endif
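// For reference, the SSE4.1/AVX2 rows declared above vectorize the classic
// djb2 hash; a minimal scalar sketch of the seeded variant (illustrative name,
// not a declared function):
static uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];  // djb2: multiply by 33, add next byte
  }
  return hash;
}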

View File

@@ -32,7 +32,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n" "subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n" "usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n" "usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n" "smlal v16.4s, v2.4h, v2.4h \n"

View File

@@ -16,9 +16,11 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
@@ -59,7 +61,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752) #pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
@@ -133,7 +135,7 @@ static uvec32 kHashMul3 = {
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg _asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
@@ -184,7 +186,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
// Visual C 2012 required for AVX2. // Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
@@ -219,8 +221,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
} }
} }
#endif // _MSC_VER >= 1700 #endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"

View File

@@ -817,22 +817,20 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = -src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24;
} }
// Neon version does direct RGB24 to YUV.
#if defined(HAS_RGB24TOYROW_NEON) #if defined(HAS_RGB24TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
RGB24ToYRow = RGB24ToYRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON; RGB24ToYRow = RGB24ToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
} }
} }
#endif // Other platforms do intermediate conversion from RGB24 to ARGB.
#if defined(HAS_RGB24TOUVROW_NEON) #else
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3) #if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -841,27 +839,29 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_RGB24TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -894,8 +894,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
} }
#if !defined(HAS_RGB24TOYROW_NEON) #if !defined(HAS_RGB24TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
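// On the non-NEON path the loop body (not shown in this hunk) has the same
// shape as the RAW version below: a pair of RGB24 rows is converted to ARGB in
// the scratch buffer, then one subsampled UV row is derived from the pair and
// one Y row from each source row. Illustrative sketch of one iteration, using
// the row functions selected above:
//   RGB24ToARGBRow(src_rgb24, row, width);
//   RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
//   ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);  // 2x2 subsampled U/V
//   ARGBToYRow(row, dst_y, width);
//   ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);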
@@ -931,22 +931,20 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
src_stride_raw = -src_stride_raw; src_stride_raw = -src_stride_raw;
} }
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON; RAWToYRow = RAWToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
} }
} }
#endif // Other platforms do intermediate conversion from RAW to ARGB.
#if defined(HAS_RAWTOUVROW_NEON) #else
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_SSSE3) #if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3; RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -955,59 +953,63 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else #else
RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw, row, width);
RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width); ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif #endif
src_raw += src_stride_raw * 2; src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2; dst_y += dst_stride_y * 2;
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
if (height & 1) { if (height & 1) {
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw, dst_y, width);
#else #else
RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width); ARGBToYRow(row, dst_y, width);
#endif #endif
} }
#if !defined(HAS_RAWTOYROW_NEON) #if !defined(HAS_RAWTOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
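// The scratch-row size moves from 16-byte to 32-byte rounding so a full AVX2
// store never runs past the buffer. Worked example of (width * 4 + 31) & ~31:
// for width = 100, 100 * 4 = 400 bytes, 400 + 31 = 431, and 431 & ~31 = 416,
// i.e. the row is padded up to the next multiple of 32 bytes.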
@@ -1043,19 +1045,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = -src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565;
} }
// Neon version does direct RGB565 to YUV.
#if defined(HAS_RGB565TOYROW_NEON) #if defined(HAS_RGB565TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
RGB565ToYRow = RGB565ToYRow_Any_NEON; RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON; RGB565ToYRow = RGB565ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON; RGB565ToUVRow = RGB565ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
RGB565ToUVRow = RGB565ToUVRow_NEON;
} }
} }
#else // HAS_RGB565TOYROW_NEON // Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2) #if defined(HAS_RGB565TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
@@ -1064,28 +1067,37 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_RGB565TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_RGB565TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -1118,8 +1130,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
} }
#if !defined(HAS_RGB565TOYROW_NEON) #if !defined(HAS_RGB565TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
@@ -1155,19 +1167,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = -src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555;
} }
// Neon version does direct ARGB1555 to YUV.
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON; ARGB1555ToYRow = ARGB1555ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
} }
} }
#else // HAS_ARGB1555TOYROW_NEON // Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2) #if defined(HAS_ARGB1555TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
@@ -1176,30 +1189,40 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_ARGB1555TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_ARGB1555TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
@@ -1230,9 +1253,9 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#endif #endif
} }
#if !defined(HAS_ARGB1555TOYROW_NEON) #if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
@@ -1268,19 +1291,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = -src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444;
} }
// Neon version does direct ARGB4444 to YUV.
#if defined(HAS_ARGB4444TOYROW_NEON) #if defined(HAS_ARGB4444TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON; ARGB4444ToYRow = ARGB4444ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
} }
} }
#else // HAS_ARGB4444TOYROW_NEON // Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2) #if defined(HAS_ARGB4444TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
@@ -1289,28 +1313,37 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_ARGB4444TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_ARGB4444TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -1345,8 +1378,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
} }
#if !defined(HAS_ARGB4444TOYROW_NEON) #if !defined(HAS_ARGB4444TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }

View File

@@ -85,6 +85,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I444ToARGBRow = I444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I444ToARGBRow = I444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I444TOARGBROW_NEON) #if defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON; I444ToARGBRow = I444ToARGBRow_Any_NEON;
@@ -222,6 +230,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I411TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I411ToARGBRow = I411ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I411ToARGBRow = I411ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I411TOARGBROW_NEON) #if defined(HAS_I411TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON; I411ToARGBRow = I411ToARGBRow_Any_NEON;
@@ -243,13 +259,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
// Convert I400 to ARGB. // Convert I400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*YToARGBRow)(const uint8* y_buf, void (*I400ToARGBRow)(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = YToARGBRow_C; int width) = I400ToARGBRow_C;
if (!src_y || !dst_argb || if (!src_y || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@@ -267,47 +283,47 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
height = 1; height = 1;
src_stride_y = dst_stride_argb = 0; src_stride_y = dst_stride_argb = 0;
} }
#if defined(HAS_YTOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2; I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2; I400ToARGBRow = I400ToARGBRow_SSE2;
} }
} }
#endif #endif
#if defined(HAS_YTOARGBROW_AVX2) #if defined(HAS_I400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2; I400ToARGBRow = I400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2; I400ToARGBRow = I400ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_YTOARGBROW_NEON) #if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON; I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON; I400ToARGBRow = I400ToARGBRow_NEON;
} }
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YToARGBRow(src_y, dst_argb, width); I400ToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
} }
return 0; return 0;
} }
// Convert I400 to ARGB. // Convert J400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y, int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C; J400ToARGBRow_C;
if (!src_y || !dst_argb || if (!src_y || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@@ -325,24 +341,32 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
height = 1; height = 1;
src_stride_y = dst_stride_argb = 0; src_stride_y = dst_stride_argb = 0;
} }
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_J400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2; J400ToARGBRow = J400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_SSE2; J400ToARGBRow = J400ToARGBRow_SSE2;
} }
} }
#endif #endif
#if defined(HAS_I400TOARGBROW_NEON) #if defined(HAS_J400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON; J400ToARGBRow = J400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON; J400ToARGBRow = J400ToARGBRow_NEON;
} }
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width); J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y; src_y += src_stride_y;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
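// J400 is full-range (JPEG) grey, so each output pixel simply replicates the
// grey byte into B, G and R with opaque alpha, whereas I400ToARGB above applies
// the limited-range Y expansion. A minimal scalar sketch of the J400 row
// (illustrative; intended to match the behavior selected as J400ToARGBRow_C):
static void J400ToARGBRow_Sketch(const uint8* src_y, uint8* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8 y = src_y[x];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}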
@@ -552,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
} }
} }
#endif #endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_NEON) #if defined(HAS_RGB565TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
@@ -602,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_NEON) #if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
@@ -652,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_NEON) #if defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;

View File

@@ -739,6 +739,14 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_AVX2;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_NEON) #if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON; I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@@ -791,6 +799,14 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_I422TORAWROW_NEON) #if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON; I422ToRAWRow = I422ToRAWRow_Any_NEON;
@@ -993,6 +1009,117 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5,
6, 2, 7, 3,
1, 5, 0, 4,
7, 3, 6, 2,
};
// Convert I420 to RGB565 with dithering.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
if (!dither4x4) {
dither4x4 = kDither565_4x4;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
{
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row_argb);
}
return 0;
}
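// Each scanline uses one 4-byte row of kDither565_4x4, selected by (y & 3) and
// passed to the row function as a packed uint32. Per pixel, a dither value of
// 0..7 is added to each 8-bit channel before truncation to 5/6/5 bits, which
// spreads the quantization error spatially. Illustrative per-pixel sketch
// (clamping helper assumed; not the exact row implementation):
//   int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // 0..7 from the packed row
//   int b = clamp255(src_argb[0] + d) >> 3;     // 5 bits
//   int g = clamp255(src_argb[1] + d) >> 2;     // 6 bits
//   int r = clamp255(src_argb[2] + d) >> 3;     // 5 bits
//   *(uint16*)dst_rgb = (uint16)(b | (g << 5) | (r << 11));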
// Convert I420 to specified format // Convert I420 to specified format
LIBYUV_API LIBYUV_API
int ConvertFromI420(const uint8* y, int y_stride, int ConvertFromI420(const uint8* y, int y_stride,

View File

@@ -72,7 +72,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
@@ -139,7 +146,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -148,6 +154,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -275,6 +289,16 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -317,8 +341,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// Allocate 2 rows of uv. // Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15); uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -374,6 +398,16 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -416,8 +450,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// Allocate 2 rows of uv. // Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15); uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -492,6 +526,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -591,6 +633,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -804,25 +854,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
static const uint8 kDither8x8[64] = { // Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
0, 128, 32, 160, 8, 136, 40, 168, static const uint8 kDither565_4x4[16] = {
192, 64, 224, 96, 200, 72, 232, 104, 0, 4, 1, 5,
48, 176, 16, 144, 56, 184, 24, 152, 6, 2, 7, 3,
240, 112, 208, 80, 248, 120, 216, 88, 1, 5, 0, 4,
12, 140, 44, 172, 4, 132, 36, 164, 7, 3, 6, 2,
204, 76, 236, 108, 196, 68, 228, 100,
60, 188, 28, 156, 52, 180, 20, 148,
252, 124, 220, 92, 244, 116, 212, 84,
}; };
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height) { const uint8* dither4x4, int width, int height) {
int y; int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C; const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1; return -1;
} }
@@ -831,13 +878,36 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
if (!dither8x8) { if (!dither4x4) {
dither8x8 = kDither8x8; dither4x4 = kDither565_4x4;
} }
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565, ARGBToRGB565DitherRow(src_argb, dst_rgb565,
dither8x8 + ((y & 7) << 3), width); *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565; dst_rgb565 += dst_stride_rgb565;
} }
@@ -845,6 +915,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
} }
// Convert ARGB To RGB565. // Convert ARGB To RGB565.
// TODO(fbarchard): Consider using dither function low level with zeros.
LIBYUV_API LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
@@ -1021,7 +1092,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C; ARGBToYJRow_C;
if (!src_argb || if (!src_argb ||
@@ -1045,7 +1116,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) #if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2; ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@@ -1140,6 +1211,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON) #if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON; ARGBToYJRow = ARGBToYJRow_Any_NEON;

View File

@@ -10,13 +10,12 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#if defined(_MSC_VER) && !defined(__clang__) #if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex() #include <intrin.h> // For __cpuidex()
#endif #endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \ defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv() #include <immintrin.h> // For _xgetbv()
#endif #endif
@@ -37,23 +36,23 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow, // For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check. // use SAFEBUFFERS to avoid additional check.
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers) #define SAFEBUFFERS __declspec(safebuffers)
#else #else
#define SAFEBUFFERS #define SAFEBUFFERS
#endif #endif
// Low level cpuid for X86. Returns zeros on other CPUs. // Low level cpuid for X86.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if (defined(_M_IX86) || defined(_M_X64) || \
(defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) && \
defined(__i386__) || defined(__x86_64__)) !defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__) #if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219) #if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx); __cpuidex((int*)(cpu_info), info_eax, info_ecx);
#endif #elif defined(_M_IX86)
#if defined(_M_IX86)
__asm { __asm {
mov eax, info_eax mov eax, info_eax
mov ecx, info_ecx mov ecx, info_ecx
@@ -71,7 +70,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
} }
#endif #endif
#else // defined(_MSC_VER) // GCC version uses inline x86 assembly.
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx; uint32 info_ebx, info_edx;
asm volatile ( // NOLINT asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__) #if defined( __i386__) && defined(__PIC__)
@@ -89,37 +89,38 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx; cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx; cpu_info[2] = info_ecx;
cpu_info[3] = info_edx; cpu_info[3] = info_edx;
#endif // defined(_MSC_VER) #endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
} }
#else // (defined(_M_IX86) || defined(_M_X64) ...
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#endif
#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#endif
#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
LIBYUV_API LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
} }
#endif #endif
// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
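// The AVX2 paths are only enabled when three independent checks pass: CPUID
// leaf 1 reports AVX and OSXSAVE, TestOsSaveYmm above confirms the OS saves
// the ymm state, and CPUID leaf 7 reports AVX2. Illustrative sketch of how
// the pieces combine (CpuId and TestOsSaveYmm as defined in this file; bit
// positions are the architectural ones):
//   uint32 cpu_info1[4], cpu_info7[4];
//   CpuId(1, 0, cpu_info1);
//   CpuId(7, 0, cpu_info7);
//   int has_avx = (cpu_info1[2] & 0x10000000) &&  // ECX bit 28: AVX
//                 (cpu_info1[2] & 0x08000000) &&  // ECX bit 27: OSXSAVE
//                 TestOsSaveYmm();                // XCR0 bits 1 and 2 set
//   int has_avx2 = has_avx && (cpu_info7[1] & 0x00000020);  // EBX bit 5: AVX2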
// based on libvpx arm_cpudetect.c // based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU // For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS LIBYUV_API SAFEBUFFERS

View File

@@ -18,6 +18,12 @@
// Must be included before jpeglib. // Must be included before jpeglib.
#include <setjmp.h> #include <setjmp.h>
#define HAVE_SETJMP #define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif #endif
struct FILE; // For jpeglib.h. struct FILE; // For jpeglib.h.

View File

@@ -23,7 +23,7 @@ extern "C" {
#ifdef ENABLE_SCASB #ifdef ENABLE_SCASB
// Multiple of 1. // Multiple of 1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm { __asm {
mov edx, edi mov edx, edi

View File

@@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Get a blender that optimized for the CPU, alignment and pixel count. // Get a blender that optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use // As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible. // the same blend function for all pixels if possible.
LIBYUV_API LIBYUV_API
@@ -677,12 +677,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
height = 1; height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
} }
#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) #if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2; ARGBAddRow = ARGBAddRow_SSE2;
} }
#endif #endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) #if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2; ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
@@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
const uint8* src_sobely, const uint8* src_sobely,
uint8* dst, int width)) { uint8* dst, int width)) {
int y; int y;
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
uint32 selector, int pix) = ARGBToBayerGGRow_C; ARGBToYJRow_C;
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C; uint8* dst_sobely, int width) = SobelYRow_C;
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
@@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
// ARGBToBayer used to select G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2) #if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 16)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON) #if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON; ARGBToYJRow = ARGBToYJRow_NEON;
} }
} }
#endif #endif
#if defined(HAS_SOBELYROW_SSE2) #if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelYRow = SobelYRow_SSE2; SobelYRow = SobelYRow_SSE2;
@@ -2040,7 +2041,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// 3 rows with edges before/after. // 3 rows with edges before/after.
const int kRowSize = (width + kEdge + 15) & ~15; const int kRowSize = (width + kEdge + 31) & ~31;
align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
uint8* row_sobelx = rows; uint8* row_sobelx = rows;
uint8* row_sobely = rows + kRowSize; uint8* row_sobely = rows + kRowSize;
@@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
uint8* row_y0 = row_y + kEdge; uint8* row_y0 = row_y + kEdge;
uint8* row_y1 = row_y0 + kRowSize; uint8* row_y1 = row_y0 + kRowSize;
uint8* row_y2 = row_y1 + kRowSize; uint8* row_y2 = row_y1 + kRowSize;
ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0]; row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); ARGBToYJRow(src_argb, row_y1, width);
row_y1[-1] = row_y1[0]; row_y1[-1] = row_y1[0];
memset(row_y1 + width, row_y1[width - 1], 16); memset(row_y1 + width, row_y1[width - 1], 16);
memset(row_y2 + width, 0, 16); memset(row_y2 + width, 0, 16);
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
// Convert next row of ARGB to Y. // Convert next row of ARGB to G.
if (y < (height - 1)) { if (y < (height - 1)) {
src_argb += src_stride_argb; src_argb += src_stride_argb;
} }
ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); ARGBToYJRow(src_argb, row_y2, width);
row_y2[-1] = row_y2[0]; row_y2[-1] = row_y2[0];
row_y2[width] = row_y2[width - 1]; row_y2[width] = row_y2[width - 1];
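// The loop keeps three luma rows (row_y0/row_y1/row_y2, padded one pixel on
// each side), and the SobelX/SobelY row functions apply the standard 3x3
// kernels to them. Illustrative per-pixel sketch over a neighborhood
// p[row][col] centered on the current pixel (not the exact row signatures):
//   int gx = (p[0][2] + 2 * p[1][2] + p[2][2])
//          - (p[0][0] + 2 * p[1][0] + p[2][0]);   // horizontal gradient
//   int gy = (p[2][0] + 2 * p[2][1] + p[2][2])
//          - (p[0][0] + 2 * p[0][1] + p[0][2]);   // vertical gradient
//   int sobel = abs(gx) + abs(gy);
//   if (sobel > 255) sobel = 255;  // SobelRow packs this value into ARGB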
@@ -2094,13 +2095,19 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C; uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2) #if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelRow = SobelRow_SSE2; SobelRow = SobelRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELROW_NEON) #if defined(HAS_SOBELROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelRow = SobelRow_NEON; SobelRow = SobelRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelRow = SobelRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2115,13 +2122,19 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C; uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2) #if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2; SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELTOPLANEROW_NEON) #if defined(HAS_SOBELTOPLANEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelToPlaneRow = SobelToPlaneRow_NEON; SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
@@ -2137,13 +2150,19 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C; uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2) #if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelXYRow = SobelXYRow_SSE2; SobelXYRow = SobelXYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELXYROW_NEON) #if defined(HAS_SOBELXYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelXYRow = SobelXYRow_NEON; SobelXYRow = SobelXYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelXYRow = SobelXYRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2322,6 +2341,214 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_yuy2 ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, rows, awidth);
SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
rows + awidth, awidth);
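// Average the two split UV rows (source_y_fraction 128, i.e. a 50/50 blend)
// into a single half-height NV12 chroma row.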
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_uyvy ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
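// In UYVY the chroma byte comes first in each pair, so the split sends the
// interleaved UV bytes to the row buffer and the Y bytes to dst_y (the
// reverse argument order from the YUY2 path above).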
SplitUVRow(src_uyvy, rows, dst_y, awidth);
SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
dst_y + dst_stride_y, awidth);
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_uyvy += src_stride_uyvy * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_uyvy, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
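Not part of the patch: a minimal caller sketch for the new packed-to-NV12 entry points, assuming the prototypes above are exposed through the usual libyuv umbrella header; the function and buffer names are illustrative only.
// Hypothetical caller (sketch only): convert one YUY2 frame to NV12.
#include "libyuv.h"  // umbrella header for the libyuv API
#include <vector>
void Yuy2FrameToNV12(const uint8* src_yuy2, int width, int height) {
  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;
  std::vector<uint8> y(width * height);               // full-size luma plane
  std::vector<uint8> uv(halfwidth * 2 * halfheight);  // interleaved U/V plane
  // YUY2 is 2 bytes per pixel; NV12 is a full-size Y plane plus a
  // half-height interleaved UV plane.
  libyuv::YUY2ToNV12(src_yuy2, width * 2,
                     y.data(), width,
                     uv.data(), halfwidth * 2,
                     width, height);
}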
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@@ -13,6 +13,7 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/convert.h" #include "libyuv/convert.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@@ -20,809 +21,39 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif // defined(__mips__)
#if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(w) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
LIBYUV_API LIBYUV_API
void TransposePlane(const uint8* src, int src_stride, void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int width, int height) { int width, int height) {
int i = height; int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride, void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) = TransposeWx8_C;
int width) = TransposeWx8_C; #if defined(HAS_TRANSPOSEWX8_NEON)
#if defined(HAS_TRANSPOSE_WX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON; TransposeWx8 = TransposeWx8_NEON;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3) #if defined(HAS_TRANSPOSEWX8_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_SSSE3; TransposeWx8 = TransposeWx8_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3; TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_Fast_SSSE3;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) && if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
} else { } else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2; TransposeWx8 = TransposeWx8_MIPS_DSPR2;
} }
@@ -837,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride,
i -= 8; i -= 8;
} }
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
} }
LIBYUV_API LIBYUV_API
@@ -955,48 +188,6 @@ void RotatePlane180(const uint8* src, int src_stride,
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
} }
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
LIBYUV_API LIBYUV_API
void TransposeUV(const uint8* src, int src_stride, void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
@@ -1007,17 +198,17 @@ void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C; int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON) #if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON; TransposeUVWx8 = TransposeUVWx8_NEON;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2) #if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2; TransposeUVWx8 = TransposeUVWx8_SSE2;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@@ -1036,10 +227,12 @@ void TransposeUV(const uint8* src, int src_stride,
i -= 8; i -= 8;
} }
TransposeUVWxH_C(src, src_stride, if (i > 0) {
dst_a, dst_stride_a, TransposeUVWxH_C(src, src_stride,
dst_b, dst_stride_b, dst_a, dst_stride_a,
width, i); dst_b, dst_stride_b,
width, i);
}
} }
LIBYUV_API LIBYUV_API

third_party/libyuv/source/rotate_any.cc vendored Normal file
View File

@@ -0,0 +1,55 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
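For reference, a hand expansion of the NEON instantiation below; this is only the macro written out, not additional patch content:
// TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) expands to:
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride, int width) {
  int r = width & 7;   // columns left over after the multiple-of-8 portion
  int n = width - r;
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);  // SIMD bulk
  }
  // Remainder: advance n source columns right and n transposed rows down.
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}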
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif
#undef TANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -27,24 +27,20 @@ extern "C" {
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
#endif #endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
#endif #endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride, static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
int i; int i;
int src_pixel_step = src_stride >> 2; int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
@@ -68,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
} }
void ARGBRotate90(const uint8* src, int src_stride, void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Rotate by 90 is a ARGBTranspose with the source read // Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end // from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride. // of the buffer and flip the sign of the source stride.
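A sketch of the call the comment describes; the function body falls outside this hunk, so the lines below are a reconstruction rather than patch content:
// Start at the last source row and walk upward by negating the stride,
// then transpose into the destination.
ARGBTranspose(src + src_stride * (height - 1), -src_stride,
              dst, dst_stride, width, height);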
@@ -79,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
} }
void ARGBRotate270(const uint8* src, int src_stride, void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Rotate by 270 is a ARGBTranspose with the destination written // Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end // from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride. // of the buffer and flip the sign of the destination stride.
@@ -90,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
} }
void ARGBRotate180(const uint8* src, int src_stride, void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row. // Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4); align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1); const uint8* src_bot = src + src_stride * (height - 1);
@@ -166,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
LIBYUV_API LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb, int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height,
int width, int height,
enum RotationMode mode) { enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1; return -1;

View File

@@ -0,0 +1,92 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

third_party/libyuv/source/rotate_gcc.cc vendored Normal file
View File

@@ -0,0 +1,493 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
@@ -22,8 +23,7 @@ extern "C" {
(_MIPS_SIM == _MIPS_SIM_ABI32) (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@@ -106,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
); );
} }
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set noat \n" ".set noat \n"
".set push \n" ".set push \n"

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
@@ -21,11 +22,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose = static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride, void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
const uint8* src_temp = NULL; const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning. int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile ( asm volatile (

third_party/libyuv/source/rotate_win.cc vendored Normal file
View File

@@ -0,0 +1,248 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

View File

@@ -199,28 +199,36 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
} }
} }
// dither4 is a row of 4 values from a 4x4 dither matrix.
// The 4x4 matrix contains values that are added to R, G and B. When
// converting to fewer bits (565) this provides an ordered dither.
// The first byte of the 4x4 matrix corresponds to the upper-left pixel.
// The 4 values are passed as an int, then referenced as a byte array, so
// endianness does not affect the order of the original matrix. But dither4
// will contain the first pixel in the lower byte on little endian and in
// the upper byte on big endian.
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int width) { const uint32 dither4, int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
int dither0 = dither8x8[x & 7] - 128; int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = dither8x8[(x & 7) + 1] - 128; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
uint8 b1 = Clamp(src_argb[4] + dither1) >> 3; uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = Clamp(src_argb[5] + dither1) >> 2; uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = Clamp(src_argb[6] + dither1) >> 3; uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27)); (b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4; dst_rgb += 4;
src_argb += 8; src_argb += 8;
} }
if (width & 1) { if (width & 1) {
int dither0 = dither8x8[(width - 1) & 7] - 128; int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
*(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
} }
} }
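A minimal sketch of how a caller might build the dither4 argument from one row of a 4x4 matrix, following the byte-array packing described above (the helper name and the include are illustrative, not libyuv API):

#include <string.h>  /* memcpy */

/* Pack one row of a 4x4 ordered-dither matrix into a dither4 word.
 * Reading the word back byte-by-byte recovers the original order on
 * either endianness, which is what ARGBToRGB565DitherRow_C relies on. */
static uint32 MakeDither4(const uint8 matrix_row[4]) {
  uint32 dither4;
  memcpy(&dither4, matrix_row, 4);
  return dither4;
}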
@@ -974,7 +982,7 @@ void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
} }
} }
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB. // Copy a Y to RGB.
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
@@ -986,38 +994,42 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
} }
} }
// YUV to RGB conversion constants. // BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias. // Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header. // TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B. // U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */ #define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */ #define UG 25 /* round(0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */ #define VG 52 /* round(0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */ #define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V. // Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB) #define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 - YGB) #define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 - YGB) #define BR (VR * 128 + YGB)
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) { uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(BB - ( u * UB) + y1) >> 6); *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
*g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6); *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
*r = Clamp((int32)(BR - (v * VR ) + y1) >> 6); *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
} }
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(y1 - YGB) >> 6); *b = Clamp((int32)(y1 + YGB) >> 6);
*g = Clamp((int32)(y1 - YGB) >> 6); *g = Clamp((int32)(y1 + YGB) >> 6);
*r = Clamp((int32)(y1 - YGB) >> 6); *r = Clamp((int32)(y1 + YGB) >> 6);
} }
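A quick numeric check of the new fixed-point constants (not part of the source): with Y = 16, U = V = 128, y1 = (16 * 0x0101 * YG) >> 16 = 1191 and the B channel evaluates to (16384 + 1191 - 17544) >> 6 = 31 >> 6 = 0, with G and R reducing to the same value, i.e. studio black. With Y = 235, U = V = 128, y1 = 17506 and every channel becomes 16346 >> 6 = 255, i.e. full white, so the 16..235 studio range maps onto 0..255 as intended.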
#undef YG #undef YG
@@ -1030,6 +1042,46 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
#undef BG #undef BG
#undef BR #undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// C reference code that mimics the YUV assembly.
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
*b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
*g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
*r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
}
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
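A similar check for the full-range JPEG constants (not part of the source): with Y = 0, U = V = 128, every channel reduces to 32 >> 6 = 0; with Y = 255, U = V = 128, y1 = (255 * 0x0101 * YGJ) >> 16 = 16319 and every channel becomes 16351 >> 6 = 255. Unlike the BT.601 path there is no 16/235 offset, so the full 0..255 Y range is preserved.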
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly. // C mimic assembly.
@@ -1102,34 +1154,6 @@ void I422ToARGBRow_C(const uint8* src_y,
} }
} }
// C reference code that mimics the YUV assembly.
// * R = Y + 1.40200 * Cr
// * G = Y - 0.34414 * Cb - 0.71414 * Cr
// * B = Y + 1.77200 * Cb
#define YGJ 64 /* (int8)round(1.000 * 64) */
#define UBJ 113 /* (int8)round(1.772 * 64) */
#define UGJ -22 /* (int8)round(-0.34414 * 64) */
#define URJ 0
#define VBJ 0
#define VGJ -46 /* (int8)round(-0.71414 * 64) */
#define VRJ 90 /* (int8)round(1.402 * 64) */
// Bias
#define BBJ (UBJ * 128 + VBJ * 128)
#define BGJ (UGJ * 128 + VGJ * 128)
#define BRJ (URJ * 128 + VRJ * 128)
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * YGJ);
*b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
*g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
*r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
}
void J422ToARGBRow_C(const uint8* src_y, void J422ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
@@ -1354,23 +1378,23 @@ void I411ToARGBRow_C(const uint8* src_y,
} }
void NV12ToARGBRow_C(const uint8* src_y, void NV12ToARGBRow_C(const uint8* src_y,
const uint8* usrc_v, const uint8* src_uv,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], YuvPixel(src_y[1], src_uv[0], src_uv[1],
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
usrc_v += 2; src_uv += 2;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
@@ -1402,7 +1426,7 @@ void NV21ToARGBRow_C(const uint8* src_y,
} }
void NV12ToRGB565Row_C(const uint8* src_y, void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* usrc_v, const uint8* src_uv,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
uint8 b0; uint8 b0;
@@ -1413,8 +1437,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
uint8 r1; uint8 r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1); YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@@ -1424,11 +1448,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
*(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27); (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2; src_y += 2;
usrc_v += 2; src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels. dst_rgb565 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@@ -1588,7 +1612,7 @@ void I422ToRGBARow_C(const uint8* src_y,
} }
} }
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@@ -2062,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
} }
} }
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
// Copy a row of G.
int x;
for (x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[1];
dst_bayer[1] = src_argb[5];
src_argb += 8;
dst_bayer += 2;
}
if (pix & 1) {
dst_bayer[0] = src_argb[1];
}
}
// Use first 4 shuffler values to reorder ARGB channels. // Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
@@ -2120,7 +2128,7 @@ void I422ToYUY2Row_C(const uint8* src_y,
if (width & 1) { if (width & 1) {
dst_frame[0] = src_y[0]; dst_frame[0] = src_y[0];
dst_frame[1] = src_u[0]; dst_frame[1] = src_u[0];
dst_frame[2] = src_y[0]; // duplicate last y dst_frame[2] = 0;
dst_frame[3] = src_v[0]; dst_frame[3] = src_v[0];
} }
} }
@@ -2144,14 +2152,15 @@ void I422ToUYVYRow_C(const uint8* src_y,
dst_frame[0] = src_u[0]; dst_frame[0] = src_u[0];
dst_frame[1] = src_y[0]; dst_frame[1] = src_y[0];
dst_frame[2] = src_v[0]; dst_frame[2] = src_v[0];
dst_frame[3] = src_y[0]; // duplicate last y dst_frame[3] = 0;
} }
} }
// Maximum temporary width for wrappers to process at a time, in pixels. // Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048 #define MAXTWIDTH 2048
#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3) #if !(defined(_MSC_VER) && !defined(__clang__)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper. // row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8* src_y, void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
@@ -2346,6 +2355,50 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX2)
void I422ToRGB24Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_I422TORAWROW_AVX2)
void I422ToRAWRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRAWRow_AVX2
ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_raw += twidth * 3;
width -= twidth;
}
}
#endif
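These wrappers follow the strip-mining pattern used elsewhere in this file: each strip of at most MAXTWIDTH (2048) pixels is first converted to ARGB in a stack row buffer and then repacked with the existing SSSE3 ARGBTo* row function, presumably so the intermediate stays small enough to remain cache-resident until true AVX2 packing rows exist (the TODOs above).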
#if defined(HAS_NV12TORGB565ROW_AVX2) #if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
uint8* dst_rgb565, int width) { uint8* dst_rgb565, int width) {

View File

@@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
} }
#endif // TESTING #endif // TESTING
#ifdef HAS_I400TOARGBROW_SSE2 #ifdef HAS_J400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n" "pslld $0x18,%%xmm5 \n"
@@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
:: "memory", "cc", "xmm0", "xmm1", "xmm5" :: "memory", "cc", "xmm0", "xmm1", "xmm5"
); );
} }
#endif // HAS_I400TOARGBROW_SSE2 #endif // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
@@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUVJROW_SSSE3
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
asm volatile ( asm volatile (
@@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
struct YuvConstants { struct YuvConstants {
lvec8 kUVToB; // 0 lvec8 kUVToB; // 0
lvec8 kUVToG; // 32 lvec8 kUVToG; // 32
@@ -1440,6 +1423,27 @@ struct YuvConstants {
lvec16 kYToRgb; // 192 lvec16 kYToRgb; // 192
}; };
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// BT601 constants for YUV to RGB. // BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = { static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
}; };
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// JPEG constants for YUV to RGB.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
{ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
{ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
{ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
{ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
{ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
{ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
{ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
// Read 8 UV from 411 // Read 8 UV from 411
#define READYUV444 \ #define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
@@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm2,%%xmm0 \n" \ "punpcklwd %%xmm2,%%xmm0 \n" \
"punpckhwd %%xmm2,%%xmm1 \n" \ "punpckhwd %%xmm2,%%xmm1 \n" \
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
// Store 8 BGRA values. Assumes XMM5 is zero. // Store 8 BGRA values. Assumes XMM5 is zero.
#define STOREBGRA \ #define STOREBGRA \
@@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \ "punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \ "punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
// Store 8 ABGR values. Assumes XMM5 is zero. // Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR \ #define STOREABGR \
@@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm0,%%xmm2 \n" \ "punpcklwd %%xmm0,%%xmm2 \n" \
"punpckhwd %%xmm0,%%xmm1 \n" \ "punpckhwd %%xmm0,%%xmm1 \n" \
"movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
// Store 8 RGBA values. Assumes XMM5 is zero. // Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \ #define STORERGBA \
@@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \ "punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \ "punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
@@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
@@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
"vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
@@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_J422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants)
// Step 3: Weave into ARGB
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_J422TOARGBROW_AVX2
#if defined(HAS_I422TOABGRROW_AVX2) #if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
@@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TORGBAROW_AVX2 #endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n" "movd %%eax,%%xmm2 \n"
@@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
); );
} }
#endif // HAS_YTOARGBROW_SSE2 #endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates. // note: vpunpcklbw mutates and vpackuswb unmutates.
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
"vmovd %%eax,%%xmm2 \n" "vmovd %%eax,%%xmm2 \n"
@@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
); );
} }
#endif // HAS_YTOARGBROW_AVX2 #endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes. // Shuffle table for reversing the bytes.
@@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n" "psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n" "sub $0x4,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 4 pixel loop. // 4 pixel loop.
@@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n" "psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n" "sub $0x4,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 4 pixel loop. // 4 pixel loop.
@@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
} }
#endif // HAS_INTERPOLATEROW_SSE2 #endif // HAS_INTERPOLATEROW_SSE2
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

View File

@@ -94,11 +94,17 @@ extern "C" {
"vtrn.u32 d2, d3 \n" "vtrn.u32 d2, d3 \n"
#define YUV422TORGB_SETUP_REG \ #define YUV422TORGB_SETUP_REG \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \ "vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
"vld1.8 {d25}, [%[kUVToG]] \n" \ "vld1.8 {d25}, [%[kUVToG]] \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
#define YUV422TORGB \ #define YUV422TORGB \
@@ -186,7 +192,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -216,7 +222,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -246,7 +252,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -277,7 +283,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -308,7 +314,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -338,7 +344,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -367,7 +373,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -397,7 +403,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -439,7 +445,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -485,7 +491,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -526,14 +532,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
void YToARGBRow_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n" ".p2align 2 \n"
@@ -552,17 +558,17 @@ void YToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
void I400ToARGBRow_NEON(const uint8* src_y, void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
@@ -603,7 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -631,7 +637,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -659,7 +665,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -687,7 +693,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -713,7 +719,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -739,7 +745,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -1245,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
); );
} }
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
@@ -1360,6 +1347,30 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
); );
} }
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
".p2align 2 \n"
"vdup.32 d2, %2 \n" // dither4
"1: \n"
MEMACCESS(1)
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n"
ARGBTORGB565
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
);
}
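Note that the saturating vqadd.u8 instructions here play the role of the clamp255() calls in ARGBToRGB565DitherRow_C: the dither value is added to B, G and R with unsigned saturation before ARGBTORGB565 truncates the channels to 5/6/5 bits.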
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
asm volatile ( asm volatile (

View File

@@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV444 READYUV444
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV411 READYUV411
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v21, v22, v23) YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v23, v22, v21) YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(3) MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
@@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3) MEMACCESS(3)
@@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3) MEMACCESS(3)
@@ -513,33 +513,34 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
} }
#endif // HAS_I422TOARGB4444ROW_NEON #endif // HAS_I422TOARGB4444ROW_NEON
#ifdef HAS_YTOARGBROW_NEON #ifdef HAS_I400TOARGBROW_NEON
void YToARGBRow_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUV400 READYUV400
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
); );
} }
#endif // HAS_YTOARGBROW_NEON #endif // HAS_I400TOARGBROW_NEON
#ifdef HAS_I400TOARGBROW_NEON #ifdef HAS_J400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y, void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
@@ -549,7 +550,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
"ld1 {v20.8b}, [%0], #8 \n" "ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n" "orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -560,7 +561,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
: "cc", "memory", "v20", "v21", "v22", "v23" : "cc", "memory", "v20", "v21", "v22", "v23"
); );
} }
#endif // HAS_I400TOARGBROW_NEON #endif // HAS_J400TOARGBROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON #ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y, void NV12ToARGBRow_NEON(const uint8* src_y,
@@ -572,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@@ -599,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@@ -626,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@@ -653,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@@ -674,19 +675,20 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
void YUY2ToARGBRow_NEON(const uint8* src_yuy2, void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUY2 READYUY2
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@@ -699,19 +701,20 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
void UYVYToARGBRow_NEON(const uint8* src_uyvy, void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READUYVY READUYVY
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@@ -728,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
@@ -754,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n" "b.gt 1b \n"
@@ -776,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %w2, %w2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n" "b.gt 1b \n"
@@ -794,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
@@ -809,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #4 \n" // 4 ints per loop "subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
@@ -822,6 +825,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
#ifdef HAS_MIRRORROW_NEON #ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
@@ -830,7 +834,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, %2, #16 \n" // 16 pixels per loop. "subs %2, %2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n" "rev64 v0.16b, v0.16b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -839,7 +843,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3 : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0" : "cc", "memory", "v0"
); );
@@ -849,6 +853,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORUVROW_NEON #ifdef HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
@@ -868,7 +873,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 "+r"(width64) // %3
: "r"((ptrdiff_t)-16) // %4 : "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1" : "cc", "memory", "v0", "v1"
); );
@@ -877,6 +882,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#ifdef HAS_ARGBMIRRORROW_NEON #ifdef HAS_ARGBMIRRORROW_NEON
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
@@ -894,7 +900,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3 : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0" : "cc", "memory", "v0"
); );
@@ -908,7 +914,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
@@ -928,7 +934,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
@@ -963,7 +969,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1022,7 +1028,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1055,7 +1061,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1075,7 +1081,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1094,7 +1100,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v4.8b, v2.8b, v2.8b \n" // mov g
"orr v5.8b, v1.8b, v1.8b \n" // mov b "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
@@ -1115,7 +1121,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1134,7 +1140,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1154,7 +1160,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
@@ -1177,7 +1183,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
@@ -1201,7 +1207,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
@@ -1231,7 +1237,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
@@ -1253,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
} }
#endif // HAS_UYVYTOUVROW_NEON #endif // HAS_UYVYTOUVROW_NEON
// Select G channels from ARGB. e.g. GGGGGGGG
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
#ifdef HAS_ARGBSHUFFLEROW_NEON #ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
@@ -1284,7 +1269,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
@@ -1312,7 +1297,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1341,7 +1326,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1362,7 +1347,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
@@ -1376,6 +1361,31 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
} }
#endif // HAS_ARGBTORGB565ROW_NEON #endif // HAS_ARGBTORGB565ROW_NEON
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
MEMACCESS(1)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n"
ARGBTORGB565
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
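For reference, a minimal C++ sketch (not libyuv code; the helper name is made up) of what the new dither row does per pixel: the same dither byte is added to B, G and R with saturation (the uqadd instructions above), and the result is then truncated to 5/6/5 bits. dither4 packs four such bytes, repeated across the 8-pixel NEON batch.

#include <algorithm>
#include <cstdint>

// Saturating-add a dither byte to each channel, then keep 5/6/5 bits.
uint16_t DitherPixelToRGB565(uint8_t b, uint8_t g, uint8_t r, uint8_t d) {
  b = static_cast<uint8_t>(std::min(255, b + d));  // uqadd v20.8b, v20.8b, v1.8b
  g = static_cast<uint8_t>(std::min(255, g + d));  // uqadd v21.8b, v21.8b, v1.8b
  r = static_cast<uint8_t>(std::min(255, r + d));  // uqadd v22.8b, v22.8b, v1.8b
  return static_cast<uint16_t>(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}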
#ifdef HAS_ARGBTOARGB1555ROW_NEON #ifdef HAS_ARGBTOARGB1555ROW_NEON
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
@@ -1383,7 +1393,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
@@ -1405,7 +1415,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
@@ -1429,7 +1439,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
@@ -1456,7 +1466,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
@@ -1487,7 +1497,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R "umlsl v4.8h, v2.8b, v26.8b \n" // R
@@ -1531,7 +1541,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
@@ -1587,7 +1597,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop. "subs %w3, %w3, #32 \n" // 32 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
@@ -1653,7 +1663,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1700,7 +1710,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1741,7 +1751,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"urshr v1.8h, v3.8h, #1 \n" "urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1782,7 +1792,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h) RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1823,7 +1833,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1864,7 +1874,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1905,7 +1915,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h) RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1971,7 +1981,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"urshr v5.8h, v18.8h, #1 \n" "urshr v5.8h, v18.8h, #1 \n"
"urshr v6.8h, v20.8h, #1 \n" "urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v16.8h, v4.8h, v22.8h \n" // B "mul v16.8h, v4.8h, v22.8h \n" // B
"mls v16.8h, v5.8h, v23.8h \n" // G "mls v16.8h, v5.8h, v23.8h \n" // G
"mls v16.8h, v6.8h, v24.8h \n" // R "mls v16.8h, v6.8h, v24.8h \n" // R
@@ -2042,7 +2052,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
@@ -2113,7 +2123,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
@@ -2153,7 +2163,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
@@ -2183,7 +2193,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -2212,7 +2222,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
@@ -2241,7 +2251,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B "umlal v16.8h, v3.8b, v6.8b \n" // B
@@ -2269,7 +2279,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B "umlal v16.8h, v2.8b, v6.8b \n" // B
@@ -2297,7 +2307,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R "umlal v16.8h, v3.8b, v6.8b \n" // R
@@ -2325,7 +2335,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
@@ -2353,7 +2363,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
@@ -2380,13 +2390,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
int y0_fraction = 256 - y1_fraction; int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %w4, #0 \n"
"b.eq 100f \n" "b.eq 100f \n"
"cmp %4, #64 \n" "cmp %w4, #64 \n"
"b.eq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %w4, #128 \n"
"b.eq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %w4, #192 \n"
"b.eq 25f \n" "b.eq 25f \n"
"dup v5.16b, %w4 \n" "dup v5.16b, %w4 \n"
@@ -2397,7 +2407,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v2.8h, v0.8b, v4.8b \n" "umull v2.8h, v0.8b, v4.8b \n"
"umull2 v3.8h, v0.16b, v4.16b \n" "umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n" "umlal v2.8h, v1.8b, v5.8b \n"
@@ -2415,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -2429,7 +2439,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
@@ -2442,7 +2452,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -2454,7 +2464,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
@@ -2477,7 +2487,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"b.lt 89f \n" "b.lt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
@@ -2485,7 +2495,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
@@ -2504,7 +2514,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"b.ge 8b \n" "b.ge 8b \n"
"89: \n" "89: \n"
"adds %3, %3, #8-1 \n" "adds %w3, %w3, #8-1 \n"
"b.lt 99f \n" "b.lt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
@@ -2513,7 +2523,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %w3, %w3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
@@ -2552,7 +2562,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a "umull v6.8h, v2.8b, v3.8b \n" // r * a
@@ -2586,7 +2596,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n" "uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n" "uxtl v2.8h, v2.8b \n"
@@ -2630,7 +2640,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n" "uxtl v6.8h, v6.8b \n"
@@ -2667,7 +2677,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
@@ -2706,7 +2716,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R "umlal v4.8h, v2.8b, v22.8b \n" // R
@@ -2746,7 +2756,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r "uxtl v18.8h, v18.8b \n" // r
@@ -2808,7 +2818,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
"umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v2.8h, v2.8b, v6.8b \n" // multiply R
@@ -2842,7 +2852,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
@@ -2872,7 +2882,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
@@ -2907,7 +2917,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"orr v1.8b, v0.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
@@ -2935,7 +2945,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
@@ -2966,7 +2976,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
@@ -3006,7 +3016,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2],%6 \n" "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %w4, %w4, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
@@ -3019,8 +3029,8 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y2), // %2 "+r"(src_y2), // %2
"+r"(dst_sobelx), // %3 "+r"(dst_sobelx), // %3
"+r"(width) // %4 "+r"(width) // %4
: "r"(2), // %5 : "r"(2LL), // %5
"r"(6) // %6 "r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
@@ -3051,7 +3061,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1) MEMACCESS(1)
"ld1 {v3.8b}, [%1],%5 \n" "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
@@ -3063,8 +3073,8 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y1), // %1 "+r"(src_y1), // %1
"+r"(dst_sobely), // %2 "+r"(dst_sobely), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"(1), // %4 : "r"(1LL), // %4
"r"(6) // %5 "r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
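The bulk of the row_neon64.cc changes follow one pattern: on AArch64, an inline-asm operand backed by a 32-bit int must be referenced through the %wN (Wn) view for arithmetic such as subs, while anything used as an address or post-index offset must be a full 64-bit value, hence the int64 width64 copies for the mirror rows and the 1LL/2LL/6LL constants in the Sobel kernels. A minimal standalone sketch of the register-width point, not taken from libyuv and assuming a positive width that is a multiple of 8:

#include <cstdint>

// Counts 8-element blocks the way the row loops above do. "%w1" picks the
// 32-bit Wn register for the int operand; plain "%1" would name Xn, whose
// upper 32 bits are not guaranteed to be zero for an int argument.
int CountBlocks(int width) {  // assumes width > 0 and width % 8 == 0
  int blocks = 0;
#if defined(__aarch64__)
  asm volatile(               // GCC/Clang extended asm
      "1:                            \n"
      "add  %w0, %w0, #1             \n"
      "subs %w1, %w1, #8             \n"  // 32-bit subtract, sets flags
      "b.gt 1b                       \n"
      : "+r"(blocks), "+r"(width)
      :
      : "cc");
#else
  blocks = width / 8;  // portable equivalent
#endif
  return blocks;
}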

File diff suppressed because it is too large

View File

@@ -23,9 +23,6 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// Remove this macro if OVERREAD is safe.
#define AVOID_OVERREAD 1
static __inline int Abs(int v) { static __inline int Abs(int v) {
return v >= 0 ? v : -v; return v >= 0 ? v : -v;
} }
@@ -44,9 +41,8 @@ static void ScalePlaneDown2(int src_width, int src_height,
int y; int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = uint8* dst_ptr, int dst_width) =
filtering == kFilterNone ? ScaleRowDown2_C : filtering == kFilterNone ? ScaleRowDown2_C :
(filtering == kFilterLinear ? ScaleRowDown2Linear_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
ScaleRowDown2Box_C);
int row_stride = src_stride << 1; int row_stride = src_stride << 1;
if (!filtering) { if (!filtering) {
src_ptr += src_stride; // Point to odd rows. src_ptr += src_stride; // Point to odd rows.
@@ -54,15 +50,39 @@ static void ScalePlaneDown2(int src_width, int src_height,
} }
#if defined(HAS_SCALEROWDOWN2_NEON) #if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
ScaleRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
ScaleRowDown2Box_NEON);
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_SSE2) #if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_SSE2); ScaleRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
ScaleRowDown2Box_Any_AVX2);
if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
ScaleRowDown2Box_AVX2);
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
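The dispatch pattern repeated throughout these scale.cc hunks: previously the SIMD row function was only used when dst_width met the kernel's alignment, otherwise the C version ran for the whole row; now an _Any_ wrapper (SIMD body plus a scalar tail) is selected whenever the CPU feature is present, and the fully aligned kernel only when the width allows it. A schematic C++ sketch with hypothetical function names, not the libyuv API:

#include <cstddef>
#include <cstdint>

typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t src_stride,
                           uint8_t* dst, int dst_width);

// Mirrors the selection logic above for one SIMD feature.
ScaleRowFn ChooseRowDown2(bool has_simd, int dst_width,
                          ScaleRowFn c_version,
                          ScaleRowFn any_version,        // any width, C tail
                          ScaleRowFn aligned_version) {  // needs dst_width % 16 == 0
  ScaleRowFn fn = c_version;
  if (has_simd) {
    fn = any_version;            // safe default once the feature is present
    if (dst_width % 16 == 0) {
      fn = aligned_version;      // full-width SIMD, no scalar tail
    }
  }
  return fn;
}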
@@ -154,13 +174,30 @@ static void ScalePlaneDown4(int src_width, int src_height,
src_stride = 0; src_stride = 0;
} }
#if defined(HAS_SCALEROWDOWN4_NEON) #if defined(HAS_SCALEROWDOWN4_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_SSE2) #if defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
@@ -249,24 +286,42 @@ static void ScalePlaneDown34(int src_width, int src_height,
ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
} }
#if defined(HAS_SCALEROWDOWN34_NEON) #if defined(HAS_SCALEROWDOWN34_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON; ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON; ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
}
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN34_SSSE3) #if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
}
} }
} }
#endif #endif
@@ -422,23 +477,41 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
} }
#if defined(HAS_SCALEROWDOWN38_NEON) #if defined(HAS_SCALEROWDOWN38_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON; ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON; ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
} else { } else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
}
if (dst_width % 12 == 0) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN38_SSSE3) #if defined(HAS_SCALEROWDOWN38_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
}
if (dst_width % 12 == 0 && !filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
} else { }
if (dst_width % 6 == 0 && filtering) {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
} }
@@ -559,65 +632,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
} }
} }
static __inline uint32 SumBox(int iboxwidth, int iboxheight, #define MIN1(x) ((x) < 1 ? 1 : (x))
ptrdiff_t src_stride, const uint8* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
ptrdiff_t src_stride, const uint16* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint8* src_ptr, uint8* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint16* src_ptr, uint16* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
uint32 sum = 0u; uint32 sum = 0u;
@@ -643,15 +658,15 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = (dx >> 16); int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth; int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = (x >> 16) - ix; boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
} }
} }
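The MIN1() clamp added above protects the reciprocal table: boxwidth and boxheight come from 16.16 fixed-point differences, so a box dimension can truncate to 0 (for example boxheight at the clamped bottom edge, or minboxwidth when dx is below one source pixel), and 65536 / (0 * boxheight) would divide by zero; this is the case the removed C fallback used to sidestep. A small illustrative calculation, not libyuv code:

#include <cstdio>

#define MIN1(x) ((x) < 1 ? 1 : (x))

int main() {
  // 16.16 fixed point: each output column covers dx/65536 source columns.
  int dx = 0x18000;             // 1.5 source pixels per output pixel
  int boxheight = 1;            // e.g. the last row, clamped at the image edge
  int minboxwidth = dx >> 16;   // 1 here; would be 0 if dx were below 0x10000
  int scaletbl[2];
  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  int x = 0;
  for (int i = 0; i < 4; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = MIN1((x >> 16) - ix);           // alternates 1, 2, 1, 2
    int scale = scaletbl[boxwidth - minboxwidth];  // 65536 / boxwidth here
    printf("col %d: boxwidth=%d scale=%d\n", i, boxwidth, scale);
  }
  return 0;
}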
@@ -660,25 +675,36 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) { const uint32* src_ptr, uint16* dst_ptr) {
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = (dx >> 16); int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth; int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = (x >> 16) - ix; boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * *dst_ptr++ =
scaleptr[boxwidth] >> 16; SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
const uint16* src_ptr, uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
} }
} }
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int i; int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth; x += boxwidth;
@@ -687,7 +713,7 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) { const uint32* src_ptr, uint16* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int i; int i;
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
@@ -707,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
int j; int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point. // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0; int x = 0;
int y = 0; int y = 0;
@@ -717,10 +743,40 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1. {
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { // Allocate a row buffer of uint16.
uint8* dst = dst_ptr; align_buffer_64(row16, src_width * 2);
int j; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C:
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleAddRow = ScaleAddRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_SSE2;
}
}
#endif
#if defined(HAS_SCALEADDROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleAddRow = ScaleAddRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
ScaleAddRow = ScaleAddRow_AVX2;
}
}
#endif
#if defined(HAS_SCALEADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleAddRow = ScaleAddRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
int boxheight; int boxheight;
int iy = y >> 16; int iy = y >> 16;
@@ -729,46 +785,13 @@ static void ScalePlaneBox(int src_width, int src_height,
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
} }
boxheight = (y >> 16) - iy; boxheight = MIN1((y >> 16) - iy);
ScalePlaneBoxRow_C(dst_width, boxheight, memset(row16, 0, src_width * 2);
x, dx, src_stride, for (k = 0; k < boxheight; ++k) {
src, dst); ScaleAddRow(src, (uint16 *)(row16), src_width);
dst += dst_stride; src += src_stride;
}
return;
}
{
// Allocate a row buffer of uint16.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint8* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
} }
boxheight = (y >> 16) - iy; ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
ScaleAddRows(src, src_stride, (uint16*)(row16),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
dst_ptr);
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
free_aligned_buffer_64(row16); free_aligned_buffer_64(row16);
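The rewritten loop replaces the multi-row ScaleAddRows kernels (and the AVOID_OVERREAD width restriction) with a simpler accumulate-per-row scheme: clear a uint16 row, call ScaleAddRow once per source row in the box, then let ScaleAddCols do the horizontal sum and the 65536/(boxwidth*boxheight) scaling. A plain C++ sketch of that scheme, a hypothetical helper rather than the libyuv function, under the simplifying assumption of an exact integer ratio:

#include <cstdint>
#include <vector>

// Box-filters `boxheight` source rows down to one destination row.
// Assumes dst_width * boxwidth <= src_width.
void BoxScaleRow(const uint8_t* src, int src_stride, int src_width,
                 int boxwidth, int boxheight, uint8_t* dst, int dst_width) {
  std::vector<uint16_t> row(src_width, 0);           // per-column accumulator
  for (int k = 0; k < boxheight; ++k) {              // ScaleAddRow per source row
    for (int x = 0; x < src_width; ++x)
      row[x] = static_cast<uint16_t>(row[x] + src[x]);
    src += src_stride;
  }
  const int scaleval = 65536 / (boxwidth * boxheight);  // 16.16 reciprocal
  for (int i = 0; i < dst_width; ++i) {              // ScaleAddCols equivalent
    uint32_t sum = 0;
    for (int b = 0; b < boxwidth; ++b) sum += row[i * boxwidth + b];
    dst[i] = static_cast<uint8_t>((sum * scaleval) >> 16);
  }
}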
@@ -779,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint16* src_ptr, uint16* dst_ptr) { const uint16* src_ptr, uint16* dst_ptr) {
int j; int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point. // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0; int x = 0;
int y = 0; int y = 0;
@@ -789,10 +812,21 @@ static void ScalePlaneBox_16(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1. {
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { // Allocate a row buffer of uint32.
uint16* dst = dst_ptr; align_buffer_64(row32, src_width * 4);
int j; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
ScaleAddRow_16_C;
#if defined(HAS_SCALEADDROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
int boxheight; int boxheight;
int iy = y >> 16; int iy = y >> 16;
@@ -801,46 +835,13 @@ static void ScalePlaneBox_16(int src_width, int src_height,
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
} }
boxheight = (y >> 16) - iy; boxheight = MIN1((y >> 16) - iy);
ScalePlaneBoxRow_16_C(dst_width, boxheight, memset(row32, 0, src_width * 4);
x, dx, src_stride, for (k = 0; k < boxheight; ++k) {
src, dst); ScaleAddRow(src, (uint32 *)(row32), src_width);
dst += dst_stride; src += src_stride;
}
return;
}
{
// Allocate a row buffer of uint32.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint16* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
} }
boxheight = (y >> 16) - iy; ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
ScaleAddRows(src, src_stride, (uint32*)(row32),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
dst_ptr);
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
free_aligned_buffer_64(row32); free_aligned_buffer_64(row32);
@@ -920,6 +921,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
@@ -1057,8 +1066,8 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C; InterpolateRow_C;
void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) = int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C; filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
@@ -1111,6 +1120,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C; ScaleFilterCols = ScaleColsUp2_C;
@@ -1129,7 +1146,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
const uint8* src = src_ptr + yi * src_stride; const uint8* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers. // Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15; const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row; uint8* rowptr = row;
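A small recurring change in this commit: the temporary row buffers round their stride up to 32 bytes instead of 16, here (dst_width + 31) & ~31 and later the ARGB equivalents, presumably so the new 32-byte-wide AVX2 kernels can store full registers into either row without spilling into the other. As a quick illustrative check of the arithmetic (ROUND32 is a made-up macro, not from the tree):

/* Illustrative: round a byte count up to the next multiple of 32. */
#define ROUND32(n) (((n) + 31) & ~31)
/* e.g. dst_width = 100 gives ROUND32(100) == 128, so the second row at
 * rowptr + kRowSize stays 32-byte aligned when the buffer itself is. */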
@@ -1188,8 +1205,8 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C; InterpolateRow_16_C;
void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) = int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
@@ -1260,7 +1277,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
const uint16* src = src_ptr + yi * src_stride; const uint16* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers. // Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15; const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 4); align_buffer_64(row, kRowSize * 4);
uint16* rowptr = (uint16*)row; uint16* rowptr = (uint16*)row;
@@ -1334,8 +1351,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
} }
for (i = 0; i < dst_height; ++i) { for (i = 0; i < dst_height; ++i) {
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
dst_width, x, dx);
dst_ptr += dst_stride; dst_ptr += dst_stride;
y += dy; y += dy;
} }
@@ -1385,8 +1401,7 @@ void ScalePlane(const uint8* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
@@ -1402,9 +1417,9 @@ void ScalePlane(const uint8* src, int src_stride,
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
return; return;
} }
if (dst_width == src_width) { if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height); int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled vertically. // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, ScalePlaneVertical(src_height,
dst_width, dst_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, src_stride, dst_stride, src, dst,
@@ -1435,7 +1450,7 @@ void ScalePlane(const uint8* src, int src_stride,
return; return;
} }
if (4 * dst_width == src_width && 4 * dst_height == src_height && if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) { (filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4 // optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height, ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering); src_stride, dst_stride, src, dst, filtering);
@@ -1469,8 +1484,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
@@ -1563,6 +1577,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }
@@ -1594,6 +1609,7 @@ int I420Scale_16(const uint16* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }
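Both I420Scale and I420Scale_16 now also reject source dimensions above 32768 up front. A hedged caller-side sketch showing where that -1 surfaces; tightly packed planes are assumed and ScaleFrame is a made-up wrapper name:

#include "libyuv/scale.h"   /* I420Scale, kFilterBox */

/* Made-up convenience wrapper: scale one tightly packed I420 frame.
 * Returns libyuv's result: -1 for NULL planes, zero-sized sources, or
 * source dimensions larger than 32768 on either axis. */
static int ScaleFrame(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                      int src_w, int src_h,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v,
                      int dst_w, int dst_h) {
  return I420Scale(src_y, src_w,
                   src_u, (src_w + 1) / 2,
                   src_v, (src_w + 1) / 2,
                   src_w, src_h,
                   dst_y, dst_w,
                   dst_u, (dst_w + 1) / 2,
                   dst_v, (dst_w + 1) / 2,
                   dst_w, dst_h, kFilterBox);
}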

200 third_party/libyuv/source/scale_any.cc vendored Normal file
View File

@@ -0,0 +1,200 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
int dst_width, int x, int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, \
dst_width & MASK, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C, 4, 3)
#endif
#undef CANY
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSE2
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
4, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C, 2, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C, 2, 4, 7)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
src_stepx, dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
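All of the wrappers in this new file follow one shape: run the SIMD kernel over the largest width it handles, then let the C kernel finish the ragged tail. Roughly what SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) expands to, written out by hand for readability (the real macro uses an unsigned modulo rather than the mask, which is equivalent for non-negative widths):

#include "libyuv/scale_row.h"   /* kernel declarations */

void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  int r = dst_width & 15;       /* MASK = 15: tail the SSE2 kernel skips */
  int n = dst_width - r;        /* multiple-of-16 portion */
  if (n > 0) {
    ScaleRowDown2_SSE2(src_ptr, src_stride, dst_ptr, n);
  }
  /* FACTOR = 2 source pixels per output, BPP = 1 byte per pixel. */
  ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
}

This is also why the callers in scale.cc and scale_argb.cc can drop their hard IS_ALIGNED preconditions and treat alignment as an optional upgrade instead.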

View File

@@ -53,16 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height,
} }
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_SSE2); ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON) #if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
ScaleARGBRowDown2_NEON; (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
} }
#endif #endif
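With the Any wrappers available, the selection logic throughout this file becomes a two-step pattern: take the Any variant as soon as the CPU feature is present, then upgrade to the full-width kernel when the destination width is aligned. A sketch of that pattern pulled out into a hypothetical helper (PickARGBRowDown2Box is not a real libyuv function; the kernel and HAS_* names are taken from this drop's scale_row.h):

#include "libyuv/basic_types.h"  /* IS_ALIGNED */
#include "libyuv/cpu_id.h"       /* TestCpuFlag, kCpuHasSSE2 */
#include "libyuv/scale_row.h"    /* ScaleARGBRowDown2Box_* kernels */

typedef void (*ARGBRowDown2Fn)(const uint8* src_argb, ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width);

static ARGBRowDown2Fn PickARGBRowDown2Box(int dst_width) {
  ARGBRowDown2Fn fn = ScaleARGBRowDown2Box_C;      /* portable fallback */
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    fn = ScaleARGBRowDown2Box_Any_SSE2;            /* any dst_width */
    if (IS_ALIGNED(dst_width, 4)) {
      fn = ScaleARGBRowDown2Box_SSE2;              /* full-width fast path */
    }
  }
#endif
  return fn;
}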
@@ -86,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
int x, int dx, int y, int dy) { int x, int dx, int y, int dy) {
int j; int j;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15; const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16); int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@@ -96,15 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4. assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON) #if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
} }
#endif #endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@@ -135,15 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2)); assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4; src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_SSE2; ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_NEON; ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
} }
#endif #endif
@@ -229,6 +255,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif #endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB. // Allocate a row of ARGB.
@@ -321,10 +355,26 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif #endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2) #if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2; ScaleARGBFilterCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C; ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -344,7 +394,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
const uint8* src = src_argb + yi * src_stride; const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15; const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row; uint8* rowptr = row;
@@ -495,10 +545,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif #endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2) #if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2; ScaleARGBFilterCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C; ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -521,7 +587,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_row_v = src_v + uv_yi * src_stride_v; const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15; const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion. // Allocate 1 row of ARGB for source conversion.
@@ -606,6 +672,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2; ScaleARGBCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (src_width * 2 == dst_width && x < 0x8000) { if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C; ScaleARGBCols = ScaleARGBColsUp2_C;
@@ -744,6 +818,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 || if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 || clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width || (clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) { (clip_y + clip_height) > dst_height) {
return -1; return -1;
@@ -762,6 +837,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height, int dst_width, int dst_height,
enum FilterMode filtering) { enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 || if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) { !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }

View File

@@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
} }
} }
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
uint16* dst_ptr, int src_width, int src_height) {
int x; int x;
assert(src_width > 0); assert(src_width > 0);
assert(src_height > 0); for (x = 0; x < src_width - 1; x += 2) {
for (x = 0; x < src_width; ++x) { dst_ptr[0] += src_ptr[0];
const uint8* s = src_ptr + x; dst_ptr[1] += src_ptr[1];
unsigned int sum = 0u; src_ptr += 2;
int y; dst_ptr += 2;
for (y = 0; y < src_height; ++y) { }
sum += s[0]; if (src_width & 1) {
s += src_stride; dst_ptr[0] += src_ptr[0];
}
// TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
} }
} }
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
uint32* dst_ptr, int src_width, int src_height) {
int x; int x;
assert(src_width > 0); assert(src_width > 0);
assert(src_height > 0); for (x = 0; x < src_width - 1; x += 2) {
for (x = 0; x < src_width; ++x) { dst_ptr[0] += src_ptr[0];
const uint16* s = src_ptr + x; dst_ptr[1] += src_ptr[1];
unsigned int sum = 0u; src_ptr += 2;
int y; dst_ptr += 2;
for (y = 0; y < src_height; ++y) { }
sum += s[0]; if (src_width & 1) {
s += src_stride; dst_ptr[0] += src_ptr[0];
}
// No risk of overflow here now
dst_ptr[x] = sum;
} }
} }
@@ -1030,10 +1022,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear; filtering = kFilterBilinear;
} }
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
} }
if (filtering == kFilterBilinear) { if (filtering == kFilterBilinear) {
if (src_height == 1) { if (src_height == 1) {

View File

@@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
); );
} }
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0; int tmp_height = 0;
intptr_t tmp_src = 0; intptr_t tmp_src = 0;
asm volatile ( asm volatile (
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(3) ",%%xmm2 \n"
"mov %0,%3 \n" "add %6,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %5,%2 \n"
"test %2,%2 \n"
"je 3f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n" "punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n" "punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n" "paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"jg 2b \n" "jg 1b \n"
LABELALIGN
"3: \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned. // Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx, uint8* dst_argb, int dst_width) {
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0; intptr_t src_stepx_x12 = 0;
asm volatile ( asm volatile (

View File

@@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
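The new ScaleRowDown2Linear_NEON above is the horizontal-only half-size filter: vpaddl.u8 sums adjacent byte pairs and vrshrn #1 halves with rounding. A scalar sketch of the same operation (RowDown2LinearRef is a hypothetical name; the C kernel in scale_common.cc computes the equivalent):

#include <stdint.h>

/* Illustrative scalar equivalent: each output byte is the rounded average
 * of two adjacent input bytes. */
static void RowDown2LinearRef(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[0] + src[1] + 1) >> 1);
    src += 2;
  }
}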
// Read 32x2 average down and write 16x1. // Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
@@ -517,6 +541,112 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
); );
} }
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q3, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"vmov q10, q1 \n"
"vmov q11, q2 \n"
"vuzp.16 q10, q11 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vsubl.s16 q11, d18, d16 \n"
"vsubl.s16 q12, d19, d17 \n"
"vmovl.u16 q13, d20 \n"
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vshrn.s32 d18, q11, #16 \n"
"vshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13"
);
}
#undef LOAD2_DATA8_LANE
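ScaleFilterCols_NEON (and the _Any_ wrapper it pairs with in scale_any.cc) performs the same 16.16 fixed-point horizontal blend as the C path: per output pixel, load the two neighbouring source bytes and mix them by the fractional part of x, which is what the vsubl/vmul/vshrn #16 sequence above computes eight pixels at a time. A scalar sketch of that math, using the hypothetical name FilterColsRef:

#include <stdint.h>

/* Illustrative scalar form of the horizontal bilinear column filter,
 * with x and dx in 16.16 fixed point. */
static void FilterColsRef(uint8_t* dst, const uint8_t* src,
                          int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;            /* integer source position */
    int a = src[xi];
    int b = src[xi + 1];
    int frac = x & 0xffff;       /* fractional part, 0..65535 */
    dst[j] = (uint8_t)(a + (((b - a) * frac) >> 16));
    x += dx;
  }
}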
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
@@ -640,6 +770,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
@@ -757,6 +916,119 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
); );
} }
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp = 0;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
}
#undef LOAD1_DATA32_LANE
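ScaleARGBCols_NEON is the unfiltered case: it gathers one 32-bit ARGB word per output position at x >> 16 and steps x by dx, eight pixels per iteration. The scalar equivalent is a one-line loop (ARGBColsRef is a hypothetical name; the real C kernel works on uint32 words the same way):

#include <stdint.h>

/* Illustrative: nearest-neighbour ARGB column step, 16.16 fixed point. */
static void ARGBColsRef(uint32_t* dst, const uint32_t* src,
                        int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}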
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q9, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
"vmov.i8 q3, #0x7f \n" // 0x7F
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
LOAD2_DATA32_LANE(d0, d2, 1)
LOAD2_DATA32_LANE(d1, d3, 0)
LOAD2_DATA32_LANE(d1, d3, 1)
"vshrn.i32 d22, q8, #9 \n"
"vand.16 d22, d22, d30 \n"
"vdup.8 d24, d22[0] \n"
"vdup.8 d25, d22[2] \n"
"vdup.8 d26, d22[4] \n"
"vdup.8 d27, d22[6] \n"
"vext.8 d4, d24, d25, #4 \n"
"vext.8 d5, d26, d27, #4 \n" // f
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
"vmull.u8 q11, d0, d20 \n"
"vmull.u8 q12, d1, d21 \n"
"vmull.u8 q13, d2, d4 \n"
"vmull.u8 q14, d3, d5 \n"
"vadd.i16 q11, q11, q13 \n"
"vadd.i16 q12, q12, q14 \n"
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#undef LOAD2_DATA32_LANE
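ScaleARGBFilterCols_NEON blends neighbouring ARGB pixels with a 7-bit fraction: f = (x >> 9) & 0x7f, and each byte lane becomes (a * (0x7f ^ f) + b * f) >> 7, which is what the 0x7f masks and the #7 narrowing shifts above implement. A per-channel scalar sketch, with a hypothetical BlendARGBRef helper and pixels treated as four consecutive bytes:

#include <stdint.h>

/* Illustrative: blend two ARGB pixels (4 bytes each) with a 7-bit fraction. */
static void BlendARGBRef(const uint8_t* a, const uint8_t* b, int f,
                         uint8_t* dst) {
  int c;
  for (c = 0; c < 4; ++c) {
    dst[c] = (uint8_t)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
  }
}

/* For output pixel j at fixed-point position x: f = (x >> 9) & 0x7f,
 * a = src + (x >> 16) * 4, b = a + 4, then x += dx. */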
#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n" "b.gt 1b \n"
@@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
);
}
// Read 32x2 average down and write 16x1. // Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
@@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1) MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
@@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -103,7 +126,7 @@ asm volatile (
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5) MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %5, %5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n" "uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" "uadalp v0.8h, v2.16b \n"
@@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
@@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1 // filter src line 0 with src line 1
// expand chars to shorts to allow for room // expand chars to shorts to allow for room
@@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n" "urhadd v0.8b, v0.8b, v4.8b \n"
"urhadd v1.8b, v1.8b, v5.8b \n" "urhadd v1.8b, v1.8b, v5.8b \n"
@@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
@@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4) MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %4, %4, #12 \n" "subs %w4, %w4, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %3, %3, #12 \n" "subs %w3, %w3, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
); );
} }
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
"1: \n"
"mov %0, %1 \n"
"mov w12, %w5 \n"
"eor v2.16b, v2.16b, v2.16b \n"
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v3.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"add v2.4s, v1.4s, v3.4s \n"
"shl v0.4s, v3.4s, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"mov v6.16b, v1.16b \n"
"mov v7.16b, v2.16b \n"
"uzp1 v6.8h, v6.8h, v7.8h \n"
"ushll v4.8h, v4.8b, #0 \n"
"ushll v5.8h, v5.8b, #0 \n"
"ssubl v16.4s, v5.4h, v4.4h \n"
"ssubl2 v17.4s, v5.8h, v4.8h \n"
"ushll v7.4s, v6.4h, #0 \n"
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
"shrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3",
"v4", "v5", "v6", "v7", "v16", "v17"
);
}
#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction; int y_fraction = 256 - source_y_fraction;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %w4, #0 \n"
"b.eq 100f \n" "b.eq 100f \n"
"add %2, %2, %1 \n" "add %2, %2, %1 \n"
"cmp %4, #64 \n" "cmp %w4, #64 \n"
"b.eq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %w4, #128 \n"
"b.eq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %w4, #192 \n"
"b.eq 25f \n" "b.eq 25f \n"
"dup v5.8b, %w4 \n" "dup v5.8b, %w4 \n"
@@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n" "umull v6.8h, v0.8b, v4.8b \n"
"umull2 v7.8h, v0.16b, v4.16b \n" "umull2 v7.8h, v0.16b, v4.16b \n"
"umlal v6.8h, v1.8b, v5.8b \n" "umlal v6.8h, v1.8b, v5.8b \n"
@@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
@@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
@@ -631,7 +761,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld2 {v0.4s, v1.4s}, [%0], #32 \n" "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0) MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n" "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %2, %2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1) MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1) MEMACCESS (1)
@@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS (0)
// load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
@@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
MEMACCESS (0) MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
@@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3 : "r"((int64)(src_stepx * 4)) // %3
: "memory", "cc", "v0" : "memory", "cc", "v0"
); );
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned. // Alignment requirement: src_argb 4 byte aligned.
// TODO, might be worth another optimization pass in future. // TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with. // It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
@@ -717,36 +874,36 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n" "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n" "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n" "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n" "ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n" "uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n" "uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n" "uaddl v6.8h, v6.8b, v7.8b \n"
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n" "mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n" "mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n" "mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n" "mov v6.d[0], v16.d[1] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -754,10 +911,129 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(dst_width) // %3 "+r"(dst_width) // %3
: "r"(src_stepx * 4) // %4 : "r"((int64)(src_stepx * 4)) // %4
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
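// The box variant averages a 2x2 block (two rows, two adjacent columns) at
// every src_stepx step; rshrn #2 above is the rounded (sum + 2) >> 2. A
// scalar sketch of that average (hypothetical helper):
static void ScaleARGBRowDownEvenBoxSketch(const uint8* src_argb,
                                          ptrdiff_t src_stride, int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8* s = src_argb + 4 * x * src_stepx;
    const uint8* t = s + src_stride;
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * x + c] =
          (uint8)((s[c] + s[4 + c] + t[c] + t[4 + c] + 2) >> 2);
    }
  }
}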
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {"#vn".s}["#n"], [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 tmp64 = 0;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(v0, 0)
LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3)
LOAD1_DATA32_LANE(v1, 0)
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp64), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1"
);
}
#undef LOAD1_DATA32_LANE
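// ScaleARGBCols steps a 16.16 fixed-point x by dx and gathers the ARGB pixel
// at (x >> 16) for each destination column; the assembly above just does
// eight such gathers per iteration. The per-pixel logic, as a scalar sketch
// (hypothetical helper):
static void ScaleARGBColsSketch(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // nearest-neighbour pick, no filtering
    x += dx;
  }
}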
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v6.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
"movi v3.16b, #0x7f \n" // 0x7F
"movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v5.4s, v1.4s, v0.4s \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
LOAD2_DATA32_LANE(v0, v1, 1)
LOAD2_DATA32_LANE(v0, v1, 2)
LOAD2_DATA32_LANE(v0, v1, 3)
"shrn v2.4h, v5.4s, #9 \n"
"and v2.8b, v2.8b, v4.8b \n"
"dup v16.8b, v2.b[0] \n"
"dup v17.8b, v2.b[2] \n"
"dup v18.8b, v2.b[4] \n"
"dup v19.8b, v2.b[6] \n"
"ext v2.8b, v16.8b, v17.8b, #4 \n"
"ext v17.8b, v18.8b, v19.8b, #4 \n"
"ins v2.d[1], v17.d[0] \n" // f
"eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
"umull v16.8h, v0.8b, v7.8b \n"
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v16", "v17", "v18", "v19"
);
}
#undef LOAD2_DATA32_LANE
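// The filtered column scaler derives a 7-bit fraction f = (x >> 9) & 0x7f
// from the 16.16 accumulator and blends the two neighbouring source pixels
// per channel as (a * (127 - f) + b * f) >> 7 (the 0x7f ^ f / umull /
// shrn #7 sequence above). A scalar sketch of that blend (hypothetical
// helper, not necessarily bit-exact with libyuv's C path):
static void ScaleARGBFilterColsSketch(uint8* dst_argb, const uint8* src_argb,
                                      int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    const uint8* a = src_argb + 4 * (x >> 16);
    const uint8* b = a + 4;
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fractional part
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * j + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}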
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus #ifdef __cplusplus
View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
@@ -16,7 +17,8 @@ extern "C" {
#endif #endif
// This module is for Visual C x86. // This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9 // Offsets for source bytes 0 to 9
static uvec8 kShuf0 = static uvec8 kShuf0 =
@@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
// Reads 32 pixels, throws half away and writes 16 pixels. // Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x1 rectangle to 16x1. // Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x2 rectangle to 16x1. // Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
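// The AVX2 point sampler keeps the odd-numbered source pixels: vpsrlw by 8
// moves byte 1 of each 16-bit pair into place and vpackuswb repacks it. The
// equivalent scalar selection (hypothetical helper):
static void ScaleRowDown2Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}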
// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
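// In the linear variant, vpmaddubsw with an all-ones byte vector sums each
// adjacent byte pair into a word, and vpavgw against zero turns that sum
// into the rounded average (sum + 1) >> 1. Scalar sketch (hypothetical
// helper):
static void ScaleRowDown2LinearSketch(const uint8* src_ptr, uint8* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}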
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN2_AVX2
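// The 64x2 -> 32x1 box path first averages the two rows with vpavgb and then
// averages horizontally, so it is built from two rounded 2-tap averages
// rather than one exact 4-tap sum. For reference, an exact-sum scalar 2x2
// box (hypothetical helper, not bit-exact with the AVX2 code):
static void ScaleRowDown2BoxSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[2 * x] + s[2 * x + 1] +
                          t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}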
// Point samples 32 pixels to 8 pixels. // Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x4 rectangle to 8x1. // Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm7, 8 psrlw xmm7, 8
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi] movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16] movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows pavgb xmm0, xmm2
pavgb xmm1, xmm3 pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2] movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16] movdqu xmm3, [eax + esi * 2 + 16]
@@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
vpsrld ymm5, ymm5, 24
vpslld ymm5, ymm5, 16
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
vzeroupper
ret
}
}
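// The 4:1 point sampler keeps one pixel out of every four; the 0x00ff0000
// mask plus the pack/permute steps appear to select byte 2 of each 4-byte
// group. Scalar sketch of that selection (hypothetical helper):
static void ScaleRowDown4Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}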
// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_ptr
mov esi, [esp + 8 + 8] // src_stride
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
vpsrlw ymm7, ymm7, 8
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpavgb ymm2, ymm2, [eax + edi]
vpavgb ymm3, ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpavgb ymm0, ymm0, ymm2
vpavgb ymm1, ymm1, ymm3
vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
vpand ymm3, ymm1, ymm7
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN4_AVX2
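// The 64x4 -> 16x1 box path averages the four rows pairwise with vpavgb and
// then reduces columns 64 -> 32 -> 16 with rounded pair averages. An
// exact-sum scalar sketch of a 4x4 box average (hypothetical helper, not
// bit-exact with the AVX2 code):
static void ScaleRowDown4BoxSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {      // four rows
      for (j = 0; j < 4; ++j) {    // four columns
        sum += src_ptr[i * src_stride + 4 * x + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}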
// Point samples 32 pixels to 24 pixels. // Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling. // Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu. __declspec(naked)
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34 // xmm7 kRound34
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
} }
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler // 3/8 point sampler
// Scale 32 pixels to 12 // Scale 32 pixels to 12
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Scale 16x3 pixels to 6x1 with interpolation // Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
} }
// Scale 16x2 pixels to 6x1 with interpolation // Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
} }
} }
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16 bytes and accumulates to 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. __declspec(naked)
__declspec(naked) __declspec(align(16)) void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
__asm { __asm {
push esi mov eax, [esp + 4] // src_ptr
push edi mov edx, [esp + 8] // dst_ptr
push ebx mov ecx, [esp + 12] // src_width
push ebp pxor xmm5, xmm5
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
dec ebx
// sum rows
xloop: xloop:
// first row movdqu xmm3, [eax] // read 16 bytes
movdqu xmm0, [esi] lea eax, [eax + 16]
lea eax, [esi + edx] movdqu xmm0, [edx] // read 16 words from destination
movdqa xmm1, xmm0 movdqu xmm1, [edx + 16]
punpcklbw xmm0, xmm4 movdqa xmm2, xmm3
punpckhbw xmm1, xmm4 punpcklbw xmm2, xmm5
lea esi, [esi + 16] punpckhbw xmm3, xmm5
mov ebp, ebx
test ebp, ebp
je ydone
// sum remaining rows
yloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
paddusw xmm0, xmm2 // sum 16 words paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3 paddusw xmm1, xmm3
sub ebp, 1 movdqu [edx], xmm0 // write 16 words to destination
jg yloop movdqu [edx + 16], xmm1
lea edx, [edx + 32]
ydone:
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
jg xloop jg xloop
pop ebp
pop ebx
pop edi
pop esi
ret ret
} }
} }
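// The reworked ScaleAddRow accumulates a single row of source bytes into a
// running row of 16-bit sums (paddusw saturates those sums); callers now add
// rows one at a time instead of passing a height, as the old ScaleAddRows
// did. Scalar sketch with the same saturating behaviour (hypothetical
// helper):
static void ScaleAddRowSketch(const uint8* src_ptr, uint16* dst_ptr,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    unsigned int sum = (unsigned int)(dst_ptr[x]) + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);
  }
}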
// Bilinear column filtering. SSSE3 version. #ifdef HAS_SCALEADDROW_AVX2
// TODO(fbarchard): Port to Neon // Reads 32 bytes and accumulates to 32 shorts at a time.
// TODO(fbarchard): Switch the following: __declspec(naked)
// xor ebx, ebx void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// mov bx, word ptr [esi + eax] // 2 source x0 pixels __asm {
// To mov eax, [esp + 4] // src_ptr
// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels mov edx, [esp + 8] // dst_ptr
// when drmemory bug fixed. mov ecx, [esp + 12] // src_width
// https://code.google.com/p/drmemory/issues/detail?id=1396 vpxor ymm5, ymm5, ymm5
__declspec(naked) __declspec(align(16)) // sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm3, [edx + 32]
vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
jg xloop
vzeroupper
ret
}
}
#endif // HAS_SCALEADDROW_AVX2
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 16 pixels, duplicates them and writes 32 pixels. // Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
} }
// Blends 8x1 rectangle to 4x1. // Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
} }
// Blends 8x2 rectangle to 4x1. // Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
} }
// Blends four 2x2 to 4x1. // Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
@@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
} }
// Column scaling unfiltered. SSE2 version. // Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
} }
// Reads 4 pixels, duplicates them and writes 8 pixels. // Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv_X86(int num, int div) { int FixedDiv_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
@@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv1_X86(int num, int div) { int FixedDiv1_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
@@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
ret ret
} }
} }
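// FixedDiv returns num / div as a 16.16 fixed-point value, i.e.
// (num << 16) / div evaluated with a 64-bit intermediate (the cdq / shld /
// idiv sequence above does the same in 32:32 form). A portable sketch
// (hypothetical helper, using libyuv's int64 typedef):
static int FixedDivSketch(int num, int div) {
  return (int)((((int64)num) << 16) / div);
}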
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"