Align image buffer in multiple-resolution encoder

Aligned the image buffer and stride to 32 bytes. This enables
calling the optimized scaler functions in libyuv and improves
performance.

Tested the libyuv scaler (x86 optimization) on Linux and Windows,
including: Linux 32/64-bit, Visual Studio 32/64-bit, Cygwin, and
MinGW32.

Also fixed an incorrect pointer in vpx_codec_encode().

Change-Id: Ibe97d7a0a745f82c43852fa4ed719be5a4db6abc
This commit is contained in:
Yunqing Wang
2011-12-08 12:31:01 -05:00
parent 254889cdfc
commit 153eec46e0
9 changed files with 348 additions and 295 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 90 Version: 102
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@@ -13,21 +13,12 @@
#include <stddef.h> // for NULL, size_t #include <stddef.h> // for NULL, size_t
#ifndef WIN32 #if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#include <stdint.h> // for uintptr_t #include <stdint.h> // for uintptr_t
#endif #endif
#ifndef INT_TYPES_DEFINED #ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED #define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64;
#endif /* COMPILER_MSVC */
typedef int int32;
typedef short int16;
typedef char int8;
#ifdef COMPILER_MSVC #ifdef COMPILER_MSVC
typedef unsigned __int64 uint64; typedef unsigned __int64 uint64;
typedef __int64 int64; typedef __int64 int64;
@@ -38,9 +29,20 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64 #define UINT64_C(x) x ## UI64
#endif #endif
#define INT64_F "I64" #define INT64_F "I64"
#else #else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64; typedef unsigned long long uint64;
//typedef long long int64; typedef long long int64;
#ifndef INT64_C #ifndef INT64_C
#define INT64_C(x) x ## LL #define INT64_C(x) x ## LL
#endif #endif
@@ -48,10 +50,14 @@ typedef unsigned long long uint64;
#define UINT64_C(x) x ## ULL #define UINT64_C(x) x ## ULL
#endif #endif
#define INT64_F "ll" #define INT64_F "ll"
#endif /* COMPILER_MSVC */ #endif // __LP64__
#endif // COMPILER_MSVC
typedef unsigned int uint32; typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16; typedef unsigned short uint16;
typedef short int16;
typedef unsigned char uint8; typedef unsigned char uint8;
typedef char int8;
#endif // INT_TYPES_DEFINED #endif // INT_TYPES_DEFINED
// Detect compiler is for x86 or x64. // Detect compiler is for x86 or x64.
@@ -60,7 +66,6 @@ typedef unsigned char uint8;
#define CPU_X86 1 #define CPU_X86 1
#endif #endif
#define IS_ALIGNED(p, a) (0==((uintptr_t)(p) & ((a)-1)))
#define ALIGNP(p, t) \ #define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + \ ((uint8*)((((uintptr_t)(p) + \
((t)-1)) & ~((t)-1)))) ((t)-1)) & ~((t)-1))))

View File

@@ -11,21 +11,39 @@
#ifndef INCLUDE_LIBYUV_CPU_ID_H_ #ifndef INCLUDE_LIBYUV_CPU_ID_H_
#define INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_
//namespace libyuv { #ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// These flags are only valid on x86 processors // These flags are only valid on x86 processors
static const int kCpuHasSSE2 = 1; static const int kCpuHasSSE2 = 1;
static const int kCpuHasSSSE3 = 2; static const int kCpuHasSSSE3 = 2;
// SIMD support on ARM processors // These flags are only valid on ARM processors
static const int kCpuHasNEON = 4; static const int kCpuHasNEON = 4;
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// Detect CPU has SSE2 etc. // Detect CPU has SSE2 etc.
int TestCpuFlag(int flag); // test_flag parameter should be one of kCpuHas constants above
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
extern int cpu_info_;
extern int InitCpuFlags();
return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
}
// For testing, allow CPU flags to be disabled. // For testing, allow CPU flags to be disabled.
void MaskCpuFlagsForTest(int enable_flags); // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
void MaskCpuFlags(int enable_flags);
//} // namespace libyuv #ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CPU_ID_H_ #endif // INCLUDE_LIBYUV_CPU_ID_H_

View File

@@ -13,7 +13,10 @@
#include "third_party/libyuv/include/libyuv/basic_types.h" #include "third_party/libyuv/include/libyuv/basic_types.h"
//namespace libyuv { #ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Supported filtering // Supported filtering
typedef enum { typedef enum {
@@ -42,16 +45,8 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height, int dst_width, int dst_height,
FilterMode filtering); FilterMode filtering);
// Legacy API // Legacy API. Deprecated
// If dst_height_offset is non-zero, the image is offset by that many pixels int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
// and stretched to (dst_height - dst_height_offset * 2) pixels high,
// instead of dst_height.
int Scale_1(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_height_offset,
int interpolate);
// Same, but specified src terms of each plane location and stride.
int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v, int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height, int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v, uint8* dst_y, uint8* dst_u, uint8* dst_v,
@@ -59,9 +54,17 @@ int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_width, int dst_height, int dst_width, int dst_height,
int interpolate); int interpolate);
// Legacy API. Deprecated
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
int interpolate);
// For testing, allow disabling of optimizations. // For testing, allow disabling of optimizations.
void SetUseReferenceImpl(int use); void SetUseReferenceImpl(int use);
//} // namespace libyuv #ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_H_ #endif // INCLUDE_LIBYUV_SCALE_H_

View File

@@ -9,66 +9,73 @@
*/ */
#include "third_party/libyuv/include/libyuv/cpu_id.h" #include "third_party/libyuv/include/libyuv/cpu_id.h"
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
#ifdef _MSC_VER #ifdef _MSC_VER
#include <intrin.h> #include <intrin.h>
#endif #endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) { static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile ( asm volatile (
"mov %%ebx, %%edi\n" "mov %%ebx, %%edi \n"
"cpuid\n" "cpuid \n"
"xchg %%edi, %%ebx\n" "xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type) : "a"(info_type)
); );
} }
#elif defined(__i386__) || defined(__x86_64__) #elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) { static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile ( asm volatile (
"cpuid\n" "cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type) : "a"(info_type)
); );
} }
#endif #endif
//namespace libyuv { #ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// CPU detect function for SIMD instruction sets. // CPU detect function for SIMD instruction sets.
static int cpu_info_initialized_ = 0; int cpu_info_ = 0;
static int cpu_info_ = 0;
// Global lock for cpu initialization. int InitCpuFlags() {
static void InitCpuFlags() {
#ifdef CPU_X86 #ifdef CPU_X86
int cpu_info[4]; int cpu_info[4];
__cpuid(cpu_info, 1); __cpuid(cpu_info, 1);
cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0); (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__) #elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__ // gcc -mfpu=neon defines __ARM_NEON__
// if code is specifically built for Neon-only, enable the flag. // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
cpu_info_ |= kCpuHasNEON; // to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON | kCpuInitialized;
#else #else
cpu_info_ = 0; cpu_info_ = kCpuInitialized;
#endif #endif
cpu_info_initialized_ = 1; return cpu_info_;
} }
void MaskCpuFlagsForTest(int enable_flags) { void MaskCpuFlags(int enable_flags) {
InitCpuFlags(); InitCpuFlags();
cpu_info_ &= enable_flags; cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
} }
int TestCpuFlag(int flag) { #ifdef __cplusplus
if (!cpu_info_initialized_) { } // extern "C"
InitCpuFlags(); } // namespace libyuv
} #endif
return cpu_info_ & flag ? 1 : 0;
}
//} // namespace libyuv

View File

@@ -14,7 +14,7 @@
#include "third_party/libyuv/include/libyuv/basic_types.h" #include "third_party/libyuv/include/libyuv/basic_types.h"
#define kMaxStride (2048 * 4) #define kMaxStride (2048 * 4)
//#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) #if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM #define YUV_DISABLE_ASM
@@ -72,7 +72,10 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
#define HAS_REVERSE_ROW_NEON #define HAS_REVERSE_ROW_NEON
#endif #endif
//extern "C" { #ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -253,6 +256,9 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
#endif #endif
//} // extern "C" #ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // LIBYUV_SOURCE_ROW_H_ #endif // LIBYUV_SOURCE_ROW_H_

View File

@@ -15,6 +15,17 @@
#include "third_party/libyuv/include/libyuv/cpu_id.h" #include "third_party/libyuv/include/libyuv/cpu_id.h"
#include "third_party/libyuv/source/row.h" #include "third_party/libyuv/source/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
/*
* Note: Defining YUV_DISABLE_ASM allows to use c version.
*/
//#define YUV_DISABLE_ASM
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var #define ALIGN16(var) __declspec(align(16)) var
#else #else
@@ -26,8 +37,6 @@
// Note: Some SSE2 reference manuals // Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
//namespace libyuv {
// Set the following flag to true to revert to only // Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and // using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and // NOT the optimized versions. Useful for debugging and
@@ -40,9 +49,7 @@ void SetUseReferenceImpl(int use) {
use_reference_impl_ = use; use_reference_impl_ = use;
} }
// TODO: The preprocessor definitions for Win64 are not right in build system. // ScaleRowDown2Int also used by planar functions
// Disable optimized code for now.
#define YUV_DISABLE_ASM
/** /**
* NEON downscalers with interpolation. * NEON downscalers with interpolation.
@@ -511,83 +518,116 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
!defined(YUV_DISABLE_ASM) !defined(YUV_DISABLE_ASM)
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var #define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX) && defined(__i386__) #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
#define TALIGN16(t, var) t var __attribute__((aligned(16))) #define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else #else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif #endif
#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".globl _" #name " \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".global " #name " \n" \
#name ": \n"
#endif
// Offsets for source bytes 0 to 9 // Offsets for source bytes 0 to 9
//extern "C"
TALIGN16(const uint8, shuf0[16]) = TALIGN16(const uint8, shuf0[16]) =
{ 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
//extern "C"
TALIGN16(const uint8, shuf1[16]) = TALIGN16(const uint8, shuf1[16]) =
{ 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
//extern "C"
TALIGN16(const uint8, shuf2[16]) = TALIGN16(const uint8, shuf2[16]) =
{ 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
// Offsets for source bytes 0 to 10 // Offsets for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, shuf01[16]) = TALIGN16(const uint8, shuf01[16]) =
{ 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
//extern "C"
TALIGN16(const uint8, shuf11[16]) = TALIGN16(const uint8, shuf11[16]) =
{ 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
//extern "C"
TALIGN16(const uint8, shuf21[16]) = TALIGN16(const uint8, shuf21[16]) =
{ 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
// Coefficients for source bytes 0 to 10 // Coefficients for source bytes 0 to 10
//extern "C"
TALIGN16(const uint8, madd01[16]) = TALIGN16(const uint8, madd01[16]) =
{ 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
// Coefficients for source bytes 10 to 21 // Coefficients for source bytes 10 to 21
//extern "C"
TALIGN16(const uint8, madd11[16]) = TALIGN16(const uint8, madd11[16]) =
{ 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
// Coefficients for source bytes 21 to 31 // Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const uint8, madd21[16]) = TALIGN16(const uint8, madd21[16]) =
{ 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
// Coefficients for source bytes 21 to 31 // Coefficients for source bytes 21 to 31
//extern "C"
TALIGN16(const int16, round34[8]) = TALIGN16(const int16, round34[8]) =
{ 2, 2, 2, 2, 2, 2, 2, 2 }; { 2, 2, 2, 2, 2, 2, 2, 2 };
//extern "C"
TALIGN16(const uint8, shuf38a[16]) = TALIGN16(const uint8, shuf38a[16]) =
{ 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
//extern "C"
TALIGN16(const uint8, shuf38b[16]) = TALIGN16(const uint8, shuf38b[16]) =
{ 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 0,1,2 // Arrange words 0,3,6 into 0,1,2
//extern "C"
TALIGN16(const uint8, shufac0[16]) = TALIGN16(const uint8, shufac0[16]) =
{ 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
// Arrange words 0,3,6 into 3,4,5 // Arrange words 0,3,6 into 3,4,5
//extern "C"
TALIGN16(const uint8, shufac3[16]) = TALIGN16(const uint8, shufac3[16]) =
{ 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
// Scaling values for boxes of 3x3 and 2x3 // Scaling values for boxes of 3x3 and 2x3
//extern "C"
TALIGN16(const uint16, scaleac3[8]) = TALIGN16(const uint16, scaleac3[8]) =
{ 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
// Arrange first value for pixels 0,1,2,3,4,5 // Arrange first value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab0[16]) = TALIGN16(const uint8, shufab0[16]) =
{ 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
// Arrange second value for pixels 0,1,2,3,4,5 // Arrange second value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab1[16]) = TALIGN16(const uint8, shufab1[16]) =
{ 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
// Arrange third value for pixels 0,1,2,3,4,5 // Arrange third value for pixels 0,1,2,3,4,5
//extern "C"
TALIGN16(const uint8, shufab2[16]) = TALIGN16(const uint8, shufab2[16]) =
{ 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
// Scaling values for boxes of 3x2 and 2x2 // Scaling values for boxes of 3x2 and 2x2
//extern "C"
TALIGN16(const uint16, scaleab2[8]) = TALIGN16(const uint16, scaleab2[8]) =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif #endif
@@ -1620,14 +1660,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
#if defined(OSX)
".globl _ScaleRowDown8Int_SSE2 \n"
"_ScaleRowDown8Int_SSE2: \n"
#else
".global ScaleRowDown8Int_SSE2 \n"
"ScaleRowDown8Int_SSE2: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebx \n" "mov 0x28(%esp),%ebx \n"
@@ -1691,14 +1724,7 @@ void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown34_SSSE3 \n"
"_ScaleRowDown34_SSSE3: \n"
#else
".global ScaleRowDown34_SSSE3 \n"
"ScaleRowDown34_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x2c(%esp),%edi \n" "mov 0x2c(%esp),%edi \n"
@@ -1729,14 +1755,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown34_1_Int_SSSE3 \n"
"_ScaleRowDown34_1_Int_SSSE3: \n"
#else
".global ScaleRowDown34_1_Int_SSSE3 \n"
"ScaleRowDown34_1_Int_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebp \n" "mov 0x28(%esp),%ebp \n"
@@ -1790,14 +1809,7 @@ void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown34_0_Int_SSSE3 \n"
"_ScaleRowDown34_0_Int_SSSE3: \n"
#else
".global ScaleRowDown34_0_Int_SSSE3 \n"
"ScaleRowDown34_0_Int_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%ebp \n" "mov 0x28(%esp),%ebp \n"
@@ -1854,14 +1866,7 @@ void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown38_SSSE3 \n"
"_ScaleRowDown38_SSSE3: \n"
#else
".global ScaleRowDown38_SSSE3 \n"
"ScaleRowDown38_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n" "mov 0x28(%esp),%edx \n"
@@ -1890,14 +1895,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown38_3_Int_SSSE3 \n"
"_ScaleRowDown38_3_Int_SSSE3: \n"
#else
".global ScaleRowDown38_3_Int_SSSE3 \n"
"ScaleRowDown38_3_Int_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n" "mov 0x28(%esp),%edx \n"
@@ -1954,14 +1952,7 @@ void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
#if defined(OSX)
".globl _ScaleRowDown38_2_Int_SSSE3 \n"
"_ScaleRowDown38_2_Int_SSSE3: \n"
#else
".global ScaleRowDown38_2_Int_SSSE3 \n"
"ScaleRowDown38_2_Int_SSSE3: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n" "mov 0x28(%esp),%edx \n"
@@ -2001,14 +1992,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width,
int src_height); int src_height);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleAddRows_SSE2)
#if defined(OSX)
".globl _ScaleAddRows_SSE2 \n"
"_ScaleAddRows_SSE2: \n"
#else
".global ScaleAddRows_SSE2 \n"
"ScaleAddRows_SSE2: \n"
#endif
"pusha \n" "pusha \n"
"mov 0x24(%esp),%esi \n" "mov 0x24(%esp),%esi \n"
"mov 0x28(%esp),%edx \n" "mov 0x28(%esp),%edx \n"
@@ -2052,14 +2036,7 @@ void ScaleFilterRows_SSE2(uint8* dst_ptr,
const uint8* src_ptr, int src_stride, const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction); int dst_width, int source_y_fraction);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleFilterRows_SSE2)
#if defined(OSX)
".globl _ScaleFilterRows_SSE2 \n"
"_ScaleFilterRows_SSE2: \n"
#else
".global ScaleFilterRows_SSE2 \n"
"ScaleFilterRows_SSE2: \n"
#endif
"push %esi \n" "push %esi \n"
"push %edi \n" "push %edi \n"
"mov 0xc(%esp),%edi \n" "mov 0xc(%esp),%edi \n"
@@ -2147,14 +2124,7 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, int src_stride, const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction); int dst_width, int source_y_fraction);
asm( asm(
".text \n" DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
#if defined(OSX)
".globl _ScaleFilterRows_SSSE3 \n"
"_ScaleFilterRows_SSSE3: \n"
#else
".global ScaleFilterRows_SSSE3 \n"
"ScaleFilterRows_SSSE3: \n"
#endif
"push %esi \n" "push %esi \n"
"push %edi \n" "push %edi \n"
"mov 0xc(%esp),%edi \n" "mov 0xc(%esp),%edi \n"
@@ -2318,7 +2288,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile( asm volatile (
"movdqa (%4),%%xmm2 \n" // _shuf01 "movdqa (%4),%%xmm2 \n" // _shuf01
"movdqa (%5),%%xmm3 \n" // _shuf11 "movdqa (%5),%%xmm3 \n" // _shuf11
"movdqa (%6),%%xmm4 \n" // _shuf21 "movdqa (%6),%%xmm4 \n" // _shuf21
@@ -2436,7 +2406,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN38_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile( asm volatile (
"movdqa (%3),%%xmm4 \n" "movdqa (%3),%%xmm4 \n"
"movdqa (%4),%%xmm5 \n" "movdqa (%4),%%xmm5 \n"
"1:" "1:"
@@ -2560,7 +2530,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
uint16* dst_ptr, int src_width, uint16* dst_ptr, int src_width,
int src_height) { int src_height) {
asm volatile( asm volatile (
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"1:" "1:"
"movdqa (%0),%%xmm2 \n" "movdqa (%0),%%xmm2 \n"
@@ -2602,7 +2572,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
const uint8* src_ptr, int src_stride, const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
if (source_y_fraction == 0) { if (source_y_fraction == 0) {
asm volatile( asm volatile (
"1:" "1:"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@@ -2620,7 +2590,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
); );
return; return;
} else if (source_y_fraction == 128) { } else if (source_y_fraction == 128) {
asm volatile( asm volatile (
"1:" "1:"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n" "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2640,7 +2610,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
); );
return; return;
} else { } else {
asm volatile( asm volatile (
"mov %3,%%eax \n" "mov %3,%%eax \n"
"movd %%eax,%%xmm6 \n" "movd %%eax,%%xmm6 \n"
"punpcklwd %%xmm6,%%xmm6 \n" "punpcklwd %%xmm6,%%xmm6 \n"
@@ -2693,7 +2663,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
const uint8* src_ptr, int src_stride, const uint8* src_ptr, int src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
if (source_y_fraction == 0) { if (source_y_fraction == 0) {
asm volatile( asm volatile (
"1:" "1:"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@@ -2711,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
); );
return; return;
} else if (source_y_fraction == 128) { } else if (source_y_fraction == 128) {
asm volatile( asm volatile (
"1:" "1:"
"movdqa (%1),%%xmm0 \n" "movdqa (%1),%%xmm0 \n"
"movdqa (%1,%3,1),%%xmm2 \n" "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2731,7 +2701,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
); );
return; return;
} else { } else {
asm volatile( asm volatile (
"mov %3,%%eax \n" "mov %3,%%eax \n"
"shr %%eax \n" "shr %%eax \n"
"mov %%al,%%ah \n" "mov %%al,%%ah \n"
@@ -3095,10 +3065,7 @@ static void ScalePlaneDown2(int src_width, int src_height,
ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
} else } else
#endif #endif
/* TODO: Force to call C version all the time in ordert to get matching results #if defined(HAS_SCALEROWDOWN2_SSE2)
* in multi-resolution encoder example.
*/
#if 0 //defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(dst_width, 16) && IS_ALIGNED(dst_width, 16) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
@@ -3292,7 +3259,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
src_row = 0; src_row = 0;
} }
} }
} }
} }
/** /**
@@ -3369,7 +3336,7 @@ static void ScalePlaneDown38(int src_width, int src_height,
} }
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
} }
} }
__inline static uint32 SumBox(int iboxwidth, int iboxheight, __inline static uint32 SumBox(int iboxwidth, int iboxheight,
@@ -3630,7 +3597,7 @@ static void ScalePlaneBilinear(int src_width, int src_height,
} }
} }
} }
} }
} }
/** /**
@@ -3818,36 +3785,32 @@ int I420Scale(const uint8* src_y, int src_stride_y,
src_stride_v = -src_stride_v; src_stride_v = -src_stride_v;
} }
{ {
int halfsrc_width = (src_width + 1) >> 1; int src_halfwidth = (src_width + 1) >> 1;
int halfsrc_height = (src_height + 1) >> 1; int src_halfheight = (src_height + 1) >> 1;
int halfdst_width = (dst_width + 1) >> 1; int dst_halfwidth = (dst_width + 1) >> 1;
int halfoheight = (dst_height + 1) >> 1; int dst_halfheight = (dst_height + 1) >> 1;
ScalePlane(src_y, src_stride_y, src_width, src_height, ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height, dst_y, dst_stride_y, dst_width, dst_height,
filtering, use_reference_impl_); filtering, use_reference_impl_);
ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
dst_u, dst_stride_u, halfdst_width, halfoheight, dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
filtering, use_reference_impl_); filtering, use_reference_impl_);
ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
dst_v, dst_stride_v, halfdst_width, halfoheight, dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
filtering, use_reference_impl_); filtering, use_reference_impl_);
} }
return 0; return 0;
} }
int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v, // Deprecated api
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v, int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height, int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v, uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height, int dst_width, int dst_height,
int interpolate) { int interpolate) {
int halfsrc_width;
int halfsrc_height;
int halfdst_width;
int halfoheight;
FilterMode filtering;
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
@@ -3864,51 +3827,58 @@ int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
src_stride_u = -src_stride_u; src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v; src_stride_v = -src_stride_v;
} }
halfsrc_width = (src_width + 1) >> 1; {
halfsrc_height = (src_height + 1) >> 1; int src_halfwidth = (src_width + 1) >> 1;
halfdst_width = (dst_width + 1) >> 1; int src_halfheight = (src_height + 1) >> 1;
halfoheight = (dst_height + 1) >> 1; int dst_halfwidth = (dst_width + 1) >> 1;
filtering = interpolate ? kFilterBox : kFilterNone; int dst_halfheight = (dst_height + 1) >> 1;
FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
ScalePlane(src_y, src_stride_y, src_width, src_height, ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height, dst_y, dst_stride_y, dst_width, dst_height,
filtering, use_reference_impl_); filtering, use_reference_impl_);
ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
dst_u, dst_stride_u, halfdst_width, halfoheight, dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
filtering, use_reference_impl_); filtering, use_reference_impl_);
ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
dst_v, dst_stride_v, halfdst_width, halfoheight, dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
filtering, use_reference_impl_); filtering, use_reference_impl_);
}
return 0; return 0;
} }
int Scale_1(const uint8* src, int src_width, int src_height, // Deprecated api
uint8* dst, int dst_width, int dst_height, int ooffset, int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
int interpolate) { int interpolate) {
if (!src || src_width <= 0 || src_height <= 0 || if (!src || src_width <= 0 || src_height <= 0 ||
!dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
ooffset >= dst_height) { dst_yoffset >= dst_height) {
return -1; return -1;
} }
ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
{ {
int halfsrc_width = (src_width + 1) >> 1; int src_halfwidth = (src_width + 1) >> 1;
int halfsrc_height = (src_height + 1) >> 1; int src_halfheight = (src_height + 1) >> 1;
int halfdst_width = (dst_width + 1) >> 1; int dst_halfwidth = (dst_width + 1) >> 1;
int halfoheight = (dst_height + 1) >> 1; int dst_halfheight = (dst_height + 1) >> 1;
int aheight = dst_height - ooffset * 2; // actual output height int aheight = dst_height - dst_yoffset * 2; // actual output height
const uint8* const iyptr = src; const uint8* const src_y = src;
uint8* oyptr = dst + ooffset * dst_width; const uint8* const src_u = src + src_width * src_height;
const uint8* const iuptr = src + src_width * src_height; const uint8* const src_v = src + src_width * src_height +
uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; src_halfwidth * src_halfheight;
const uint8* const ivptr = src + src_width * src_height + uint8* dst_y = dst + dst_yoffset * dst_width;
halfsrc_width * halfsrc_height; uint8* dst_u = dst + dst_width * dst_height +
uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + (dst_yoffset >> 1) * dst_halfwidth;
(ooffset >> 1) * halfdst_width; uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
return Scale_2(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, (dst_yoffset >> 1) * dst_halfwidth;
src_width, src_height, oyptr, ouptr, ovptr, dst_width, return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
halfdst_width, halfdst_width, dst_width, aheight, interpolate); src_width, src_height, dst_y, dst_u, dst_v, dst_width,
dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
} }
} }
//} // namespace libyuv #ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -78,6 +78,8 @@ static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
/* Frame-reader selected at runtime in main(): plain read_frame() when the
 * allocated Y stride equals the display width (one contiguous fread per
 * plane suffices), read_frame_by_row() otherwise (stride-aligned buffer
 * requires a per-row copy). */
int (*read_frame_p)(FILE *f, vpx_image_t *img);
static int read_frame(FILE *f, vpx_image_t *img) { static int read_frame(FILE *f, vpx_image_t *img) {
size_t nbytes, to_read; size_t nbytes, to_read;
int res = 1; int res = 1;
@@ -92,6 +94,55 @@ static int read_frame(FILE *f, vpx_image_t *img) {
return res; return res;
} }
/* Read one raw I420/YV12 frame from |f| into |img|, copying row by row.
 * This path is needed when the image buffer's stride is wider than the
 * display width (e.g. when vpx_img_alloc() over-allocates for alignment),
 * so the plane data in the file is not contiguous in the buffer.
 * Returns 1 on success, 0 on EOF or a short read (a partial read prints a
 * warning). */
static int read_frame_by_row(FILE *f, vpx_image_t *img) {
  size_t nbytes, to_read;
  int res = 1;
  int plane;

  for (plane = 0; plane < 3; plane++) {
    unsigned char *buf;
    int width = (plane ? (1 + img->d_w) / 2 : img->d_w);
    int height = (plane ? (1 + img->d_h) / 2 : img->d_h);
    int row;

    /* The file always stores planes in Y,U,V order, but a YV12 image
     * swaps the U and V plane pointers, so map the loop index to the
     * correct plane for the image format. */
    switch (plane) {
      case 1:
        buf = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V
                                                       : VPX_PLANE_U];
        break;
      case 2:
        buf = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U
                                                       : VPX_PLANE_V];
        break;
      default:
        buf = img->planes[plane];
    }

    for (row = 0; row < height; row++) {
      to_read = width;
      nbytes = fread(buf, 1, to_read, f);
      if (nbytes != to_read) {
        res = 0;
        if (nbytes > 0)
          printf("Warning: Read partial frame. Check your width & height!\n");
        break;
      }
      /* Advance by the buffer stride, which may exceed the row width. */
      buf += img->stride[plane];
    }
    if (!res)
      break;
  }
  return res;
}
static void write_ivf_file_header(FILE *outfile, static void write_ivf_file_header(FILE *outfile,
const vpx_codec_enc_cfg_t *cfg, const vpx_codec_enc_cfg_t *cfg,
int frame_cnt) { int frame_cnt) {
@@ -262,9 +313,14 @@ int main(int argc, char **argv)
/* Allocate image for each encoder */ /* Allocate image for each encoder */
for (i=0; i< NUM_ENCODERS; i++) for (i=0; i< NUM_ENCODERS; i++)
if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 1)) if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
read_frame_p = read_frame;
else
read_frame_p = read_frame_by_row;
for (i=0; i< NUM_ENCODERS; i++) for (i=0; i< NUM_ENCODERS; i++)
write_ivf_file_header(outfile[i], &cfg[i], 0); write_ivf_file_header(outfile[i], &cfg[i], 0);
@@ -305,35 +361,22 @@ int main(int argc, char **argv)
const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
flags = 0; flags = 0;
frame_avail = read_frame(infile, &raw[0]); frame_avail = read_frame_p(infile, &raw[0]);
for ( i=1; i<NUM_ENCODERS; i++)
{
if(frame_avail) if(frame_avail)
{
for ( i=1; i<NUM_ENCODERS; i++)
{ {
/*Scale the image down a number of times by downsampling factor*/ /*Scale the image down a number of times by downsampling factor*/
int src_uvwidth = (raw[i-1].d_w + 1) >> 1;
int src_uvheight = (raw[i-1].d_h + 1) >> 1;
const unsigned char* src_y = raw[i-1].planes[VPX_PLANE_Y];
const unsigned char* src_u = raw[i-1].planes[VPX_PLANE_Y]
+ raw[i-1].d_w*raw[i-1].d_h;
const unsigned char* src_v = raw[i-1].planes[VPX_PLANE_Y]
+ raw[i-1].d_w*raw[i-1].d_h
+ src_uvwidth*src_uvheight;
int dst_uvwidth = (raw[i].d_w + 1) >> 1;
int dst_uvheight = (raw[i].d_h + 1) >> 1;
unsigned char* dst_y = raw[i].planes[VPX_PLANE_Y];
unsigned char* dst_u = raw[i].planes[VPX_PLANE_Y]
+ raw[i].d_w*raw[i].d_h;
unsigned char* dst_v = raw[i].planes[VPX_PLANE_Y]
+ raw[i].d_w*raw[i].d_h
+ dst_uvwidth*dst_uvheight;
/* FilterMode 1 or 2 give better psnr than FilterMode 0. */ /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
I420Scale(src_y, raw[i-1].d_w, src_u, src_uvwidth, src_v, I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
src_uvwidth, raw[i-1].d_w, raw[i-1].d_h, raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
dst_y, raw[i].d_w, dst_u, dst_uvwidth, raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
dst_v, dst_uvwidth, raw[i].d_w, raw[i].d_h, 1); raw[i-1].d_w, raw[i-1].d_h,
raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
raw[i].d_w, raw[i].d_h, 1);
} }
} }

View File

@@ -243,6 +243,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx,
ctx--; ctx--;
if (img) img--; if (img) img--;
} }
ctx++;
} }
FLOATING_POINT_RESTORE(); FLOATING_POINT_RESTORE();