Merge "Align image buffer in multiple-resolution encoder"

2011-12-13 10:39:44 -08:00 · 2011-12-13 10:39:44 -08:00 · 72af533f79
commit 72af533f79
parent 6b2792b0e0 153eec46e0
9 changed files with 348 additions and 295 deletions
--- a/third_party/libyuv/README.webm
+++ b/third_party/libyuv/README.webm
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 90
+Version: 102
 License: BSD
 License File: LICENSE

--- a/third_party/libyuv/include/libyuv/basic_types.h
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@ -13,21 +13,12 @@

 #include <stddef.h>  // for NULL, size_t

-#ifndef WIN32
+#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
 #include <stdint.h>  // for uintptr_t
 #endif

 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef __int64 int64;
-#else
-typedef long long int64;
-#endif /* COMPILER_MSVC */
-typedef int int32;
-typedef short int16;
-typedef char int8;
-
 #ifdef COMPILER_MSVC
 typedef unsigned __int64 uint64;
 typedef __int64 int64;
@ -38,9 +29,20 @@ typedef __int64 int64;
 #define UINT64_C(x) x ## UI64
 #endif
 #define INT64_F "I64"
-#else
+#else  // COMPILER_MSVC
+#ifdef __LP64__
+typedef unsigned long uint64;
+typedef long int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else  // __LP64__
 typedef unsigned long long uint64;
-//typedef long long int64;
+typedef long long int64;
 #ifndef INT64_C
 #define INT64_C(x) x ## LL
 #endif
@ -48,10 +50,14 @@ typedef unsigned long long uint64;
 #define UINT64_C(x) x ## ULL
 #endif
 #define INT64_F "ll"
-#endif /* COMPILER_MSVC */
+#endif  // __LP64__
+#endif  // COMPILER_MSVC
 typedef unsigned int uint32;
+typedef int int32;
 typedef unsigned short uint16;
+typedef short int16;
 typedef unsigned char uint8;
+typedef char int8;
 #endif  // INT_TYPES_DEFINED

 // Detect compiler is for x86 or x64.
@ -60,7 +66,6 @@ typedef unsigned char uint8;
 #define CPU_X86 1
 #endif

-#define IS_ALIGNED(p, a) (0==((uintptr_t)(p) & ((a)-1)))
 #define ALIGNP(p, t) \
  ((uint8*)((((uintptr_t)(p) + \
  ((t)-1)) & ~((t)-1))))
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@ -11,21 +11,39 @@
 #ifndef INCLUDE_LIBYUV_CPU_ID_H_
 #define INCLUDE_LIBYUV_CPU_ID_H_

-//namespace libyuv {
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif

 // These flags are only valid on x86 processors
 static const int kCpuHasSSE2 = 1;
 static const int kCpuHasSSSE3 = 2;

-// SIMD support on ARM processors
+// These flags are only valid on ARM processors
 static const int kCpuHasNEON = 4;

+// Internal flag to indicate cpuid is initialized.
+static const int kCpuInitialized = 8;
+
 // Detect CPU has SSE2 etc.
-int TestCpuFlag(int flag);
+// The test_flag parameter should be one of the kCpuHas* constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+  extern int cpu_info_;
+  extern int InitCpuFlags();
+  return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+}

 // For testing, allow CPU flags to be disabled.
-void MaskCpuFlagsForTest(int enable_flags);
+// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// -1 to enable all cpu specific optimizations.
+// 0 to disable all cpu specific optimizations.
+void MaskCpuFlags(int enable_flags);

-//}  // namespace libyuv
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif

 #endif  // INCLUDE_LIBYUV_CPU_ID_H_
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@ -13,7 +13,10 @@

 #include "third_party/libyuv/include/libyuv/basic_types.h"

-//namespace libyuv {
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif

 // Supported filtering
 typedef enum {
@ -42,16 +45,8 @@ int I420Scale(const uint8* src_y, int src_stride_y,
              int dst_width, int dst_height,
              FilterMode filtering);

-// Legacy API
-// If dst_height_offset is non-zero, the image is offset by that many pixels
-// and stretched to (dst_height - dst_height_offset * 2) pixels high,
-// instead of dst_height.
-int Scale_1(const uint8* src, int src_width, int src_height,
-          uint8* dst, int dst_width, int dst_height, int dst_height_offset,
-          int interpolate);
-
-// Same, but specified src terms of each plane location and stride.
-int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+// Legacy API.  Deprecated
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
@ -59,9 +54,17 @@ int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int dst_width, int dst_height,
          int interpolate);

+// Legacy API.  Deprecated
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+                int interpolate);
+
 // For testing, allow disabling of optimizations.
 void SetUseReferenceImpl(int use);

-//} // namespace libyuv
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif

 #endif // INCLUDE_LIBYUV_SCALE_H_
--- a/third_party/libyuv/source/cpu_id.c
+++ b/third_party/libyuv/source/cpu_id.c
@ -9,66 +9,73 @@
 */

 #include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/basic_types.h"  // for CPU_X86

 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
+#ifdef __ANDROID__
+#include <cpu-features.h>
+#endif
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"  // for CPU_X86

 // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
 #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
 static inline void __cpuid(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "mov %%ebx, %%edi\n"
-    "cpuid\n"
-    "xchg %%edi, %%ebx\n"
+  asm volatile (
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type)
  );
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static inline void __cpuid(int cpu_info[4], int info_type) {
-  __asm__ volatile (
-    "cpuid\n"
+  asm volatile (
+    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type)
  );
 }
 #endif

-//namespace libyuv {
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif

 // CPU detect function for SIMD instruction sets.
-static int cpu_info_initialized_ = 0;
-static int cpu_info_ = 0;
+int cpu_info_ = 0;

-// Global lock for cpu initialization.
-static void InitCpuFlags() {
+int InitCpuFlags() {
 #ifdef CPU_X86
  int cpu_info[4];
  __cpuid(cpu_info, 1);
-  cpu_info_ = (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
-              (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0);
+  cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
+              (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
+              kCpuInitialized;
+#elif defined(__ANDROID__) && defined(__ARM_NEON__)
+  uint64_t features = android_getCpuFeatures();
+  cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
+              kCpuInitialized;
 #elif defined(__ARM_NEON__)
  // gcc -mfpu=neon defines __ARM_NEON__
-  // if code is specifically built for Neon-only, enable the flag.
-  cpu_info_ |= kCpuHasNEON;
+  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
+  // to disable Neon on devices that do not have it.
+  cpu_info_ = kCpuHasNEON | kCpuInitialized;
 #else
-  cpu_info_ = 0;
+  cpu_info_ = kCpuInitialized;
 #endif
-  cpu_info_initialized_ = 1;
+  return cpu_info_;
 }

-void MaskCpuFlagsForTest(int enable_flags) {
+void MaskCpuFlags(int enable_flags) {
  InitCpuFlags();
-  cpu_info_ &= enable_flags;
+  cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
 }

-int TestCpuFlag(int flag) {
-  if (!cpu_info_initialized_) {
-    InitCpuFlags();
-  }
-  return cpu_info_ & flag ? 1 : 0;
-}
-
-//}  // namespace libyuv
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/row.h
+++ b/third_party/libyuv/source/row.h
@ -14,7 +14,7 @@
 #include "third_party/libyuv/include/libyuv/basic_types.h"

 #define kMaxStride (2048 * 4)
-//#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

 #if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
 #define YUV_DISABLE_ASM
@ -72,7 +72,10 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
 #define HAS_REVERSE_ROW_NEON
 #endif

-//extern "C" {
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif

 #ifdef HAS_ARGBTOYROW_SSSE3
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@ -253,6 +256,9 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,

 #endif

-//}  // extern "C"
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif

 #endif  // LIBYUV_SOURCE_ROW_H_
--- a/third_party/libyuv/source/scale.c
+++ b/third_party/libyuv/source/scale.c
@ -15,6 +15,17 @@

 #include "third_party/libyuv/include/libyuv/cpu_id.h"
 #include "third_party/libyuv/source/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+/*
+ * Note: Define YUV_DISABLE_ASM to use the C version instead of assembly.
+ */
+//#define YUV_DISABLE_ASM
+
 #if defined(_MSC_VER)
 #define ALIGN16(var) __declspec(align(16)) var
 #else
@ -26,8 +37,6 @@
 // Note: Some SSE2 reference manuals
 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

-//namespace libyuv {
-
 // Set the following flag to true to revert to only
 // using the reference implementation ScalePlaneBox(), and
 // NOT the optimized versions. Useful for debugging and
@ -40,9 +49,7 @@ void SetUseReferenceImpl(int use) {
  use_reference_impl_ = use;
 }

-// TODO: The preprocessor definitions for Win64 are not right in build system.
-// Disable optimized code for now.
-#define YUV_DISABLE_ASM
+// ScaleRowDown2Int also used by planar functions

 /**
 * NEON downscalers with interpolation.
@ -511,83 +518,116 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
    !defined(YUV_DISABLE_ASM)
 #if defined(_MSC_VER)
 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif defined(OSX) && defined(__i386__)
+#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
 #else
 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
 #endif

+#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
+    defined(__i386__)
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".globl _" #name "                         \n"                             \
+"_" #name ":                                   \n"
+#else
+#define DECLARE_FUNCTION(name)                                                 \
+    ".text                                     \n"                             \
+    ".global " #name "                         \n"                             \
+#name ":                                       \n"
+#endif
+
+
 // Offsets for source bytes 0 to 9
+//extern "C"
 TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+//extern "C"
 TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+//extern "C"
 TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Offsets for source bytes 0 to 10
+//extern "C"
 TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+//extern "C"
 TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+//extern "C"
 TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

 // Coefficients for source bytes 0 to 10
+//extern "C"
 TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

 // Coefficients for source bytes 10 to 21
+//extern "C"
 TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

 // Coefficients for source bytes 21 to 31
+//extern "C"
 TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

 // Coefficients for source bytes 21 to 31
+//extern "C"
 TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

+//extern "C"
 TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

+//extern "C"
 TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

 // Arrange words 0,3,6 into 0,1,2
+//extern "C"
 TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

 // Arrange words 0,3,6 into 3,4,5
+//extern "C"
 TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

 // Scaling values for boxes of 3x3 and 2x3
+//extern "C"
 TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

 // Arrange first value for pixels 0,1,2,3,4,5
+//extern "C"
 TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

 // Arrange second value for pixels 0,1,2,3,4,5
+//extern "C"
 TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

 // Arrange third value for pixels 0,1,2,3,4,5
+//extern "C"
 TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

 // Scaling values for boxes of 3x2 and 2x2
+//extern "C"
 TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
 #endif
@ -1620,14 +1660,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
 void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown8Int_SSE2             \n"
-"_ScaleRowDown8Int_SSE2:                       \n"
-#else
-    ".global ScaleRowDown8Int_SSE2             \n"
-"ScaleRowDown8Int_SSE2:                        \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%ebx                    \n"
@ -1691,14 +1724,7 @@ void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown34_SSSE3              \n"
-"_ScaleRowDown34_SSSE3:                        \n"
-#else
-    ".global ScaleRowDown34_SSSE3              \n"
-"ScaleRowDown34_SSSE3:                         \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x2c(%esp),%edi                    \n"
@ -1729,14 +1755,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
 void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown34_1_Int_SSSE3        \n"
-"_ScaleRowDown34_1_Int_SSSE3:                  \n"
-#else
-    ".global ScaleRowDown34_1_Int_SSSE3        \n"
-"ScaleRowDown34_1_Int_SSSE3:                   \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%ebp                    \n"
@ -1790,14 +1809,7 @@ void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
 void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown34_0_Int_SSSE3        \n"
-"_ScaleRowDown34_0_Int_SSSE3:                  \n"
-#else
-    ".global ScaleRowDown34_0_Int_SSSE3        \n"
-"ScaleRowDown34_0_Int_SSSE3:                   \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%ebp                    \n"
@ -1854,14 +1866,7 @@ void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
 void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown38_SSSE3              \n"
-"_ScaleRowDown38_SSSE3:                        \n"
-#else
-    ".global ScaleRowDown38_SSSE3              \n"
-"ScaleRowDown38_SSSE3:                         \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%edx                    \n"
@ -1890,14 +1895,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
 void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown38_3_Int_SSSE3        \n"
-"_ScaleRowDown38_3_Int_SSSE3:                  \n"
-#else
-    ".global ScaleRowDown38_3_Int_SSSE3        \n"
-"ScaleRowDown38_3_Int_SSSE3:                   \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%edx                    \n"
@ -1954,14 +1952,7 @@ void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
 void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleRowDown38_2_Int_SSSE3        \n"
-"_ScaleRowDown38_2_Int_SSSE3:                  \n"
-#else
-    ".global ScaleRowDown38_2_Int_SSSE3        \n"
-"ScaleRowDown38_2_Int_SSSE3:                   \n"
-#endif
+    DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%edx                    \n"
@ -2001,14 +1992,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleAddRows_SSE2                 \n"
-"_ScaleAddRows_SSE2:                           \n"
-#else
-    ".global ScaleAddRows_SSE2                 \n"
-"ScaleAddRows_SSE2:                            \n"
-#endif
+    DECLARE_FUNCTION(ScaleAddRows_SSE2)
    "pusha                                     \n"
    "mov    0x24(%esp),%esi                    \n"
    "mov    0x28(%esp),%edx                    \n"
@ -2052,14 +2036,7 @@ void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                     const uint8* src_ptr, int src_stride,
                                     int dst_width, int source_y_fraction);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleFilterRows_SSE2              \n"
-"_ScaleFilterRows_SSE2:                        \n"
-#else
-    ".global ScaleFilterRows_SSE2              \n"
-"ScaleFilterRows_SSE2:                         \n"
-#endif
+    DECLARE_FUNCTION(ScaleFilterRows_SSE2)
    "push   %esi                               \n"
    "push   %edi                               \n"
    "mov    0xc(%esp),%edi                     \n"
@ -2147,14 +2124,7 @@ void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                      const uint8* src_ptr, int src_stride,
                                      int dst_width, int source_y_fraction);
  asm(
-    ".text                                     \n"
-#if defined(OSX)
-    ".globl _ScaleFilterRows_SSSE3             \n"
-"_ScaleFilterRows_SSSE3:                       \n"
-#else
-    ".global ScaleFilterRows_SSSE3             \n"
-"ScaleFilterRows_SSSE3:                        \n"
-#endif
+    DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
    "push   %esi                               \n"
    "push   %edi                               \n"
    "mov    0xc(%esp),%edi                     \n"
@ -2318,7 +2288,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,

 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
-  asm volatile(
+  asm volatile (
  "movdqa     (%4),%%xmm2                      \n"  // _shuf01
  "movdqa     (%5),%%xmm3                      \n"  // _shuf11
  "movdqa     (%6),%%xmm4                      \n"  // _shuf21
@ -2436,7 +2406,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
 #define HAS_SCALEROWDOWN38_SSSE3
 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
-  asm volatile(
+  asm volatile (
  "movdqa     (%3),%%xmm4                      \n"
  "movdqa     (%4),%%xmm5                      \n"
 "1:"
@ -2560,7 +2530,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
-  asm volatile(
+  asm volatile (
  "pxor       %%xmm5,%%xmm5                    \n"
 "1:"
  "movdqa     (%0),%%xmm2                      \n"
@ -2602,7 +2572,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                 const uint8* src_ptr, int src_stride,
                                 int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
-    asm volatile(
+    asm volatile (
    "1:"
      "movdqa     (%1),%%xmm0                  \n"
      "lea        0x10(%1),%1                  \n"
@ -2620,7 +2590,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
    );
    return;
  } else if (source_y_fraction == 128) {
-    asm volatile(
+    asm volatile (
    "1:"
      "movdqa     (%1),%%xmm0                  \n"
      "movdqa     (%1,%3,1),%%xmm2             \n"
@ -2640,7 +2610,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
    );
    return;
  } else {
-    asm volatile(
+    asm volatile (
      "mov        %3,%%eax                     \n"
      "movd       %%eax,%%xmm6                 \n"
      "punpcklwd  %%xmm6,%%xmm6                \n"
@ -2693,7 +2663,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
-    asm volatile(
+    asm volatile (
   "1:"
      "movdqa     (%1),%%xmm0                  \n"
      "lea        0x10(%1),%1                  \n"
@ -2711,7 +2681,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    );
    return;
  } else if (source_y_fraction == 128) {
-    asm volatile(
+    asm volatile (
    "1:"
      "movdqa     (%1),%%xmm0                  \n"
      "movdqa     (%1,%3,1),%%xmm2             \n"
@ -2731,7 +2701,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    );
    return;
  } else {
-    asm volatile(
+    asm volatile (
      "mov        %3,%%eax                     \n"
      "shr        %%eax                        \n"
      "mov        %%al,%%ah                    \n"
@ -3095,10 +3065,7 @@ static void ScalePlaneDown2(int src_width, int src_height,
    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
  } else
 #endif
-/* TODO: Force to call C version all the time in ordert to get matching results
- * in multi-resolution encoder example.
- */
-#if 0 //defined(HAS_SCALEROWDOWN2_SSE2)
+#if defined(HAS_SCALEROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(dst_width, 16) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
@ -3292,7 +3259,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
      src_row = 0;
    }
  }
-  }
+}
 }

 /**
@ -3369,7 +3336,7 @@ static void ScalePlaneDown38(int src_width, int src_height,
    }
    dst_ptr += dst_stride;
  }
-  }
+}
 }

 __inline static uint32 SumBox(int iboxwidth, int iboxheight,
@ -3630,7 +3597,7 @@ static void ScalePlaneBilinear(int src_width, int src_height,
      }
    }
  }
-  }
+}
 }

 /**
@ -3818,36 +3785,32 @@ int I420Scale(const uint8* src_y, int src_stride_y,
    src_stride_v = -src_stride_v;
  }
  {
-    int halfsrc_width = (src_width + 1) >> 1;
-    int halfsrc_height = (src_height + 1) >> 1;
-    int halfdst_width = (dst_width + 1) >> 1;
-    int halfoheight = (dst_height + 1) >> 1;
+  int src_halfwidth = (src_width + 1) >> 1;
+  int src_halfheight = (src_height + 1) >> 1;
+  int dst_halfwidth = (dst_width + 1) >> 1;
+  int dst_halfheight = (dst_height + 1) >> 1;

    ScalePlane(src_y, src_stride_y, src_width, src_height,
               dst_y, dst_stride_y, dst_width, dst_height,
               filtering, use_reference_impl_);
-    ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
-               dst_u, dst_stride_u, halfdst_width, halfoheight,
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
             filtering, use_reference_impl_);
-    ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
-               dst_v, dst_stride_v, halfdst_width, halfoheight,
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
             filtering, use_reference_impl_);
  }
  return 0;
 }

-int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+// Deprecated api
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          int interpolate) {
-  int halfsrc_width;
-  int halfsrc_height;
-  int halfdst_width;
-  int halfoheight;
-  FilterMode filtering;
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
@ -3864,51 +3827,58 @@ int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v,
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
-  halfsrc_width = (src_width + 1) >> 1;
-  halfsrc_height = (src_height + 1) >> 1;
-  halfdst_width = (dst_width + 1) >> 1;
-  halfoheight = (dst_height + 1) >> 1;
-  filtering = interpolate ? kFilterBox : kFilterNone;
+  {
+  int src_halfwidth = (src_width + 1) >> 1;
+  int src_halfheight = (src_height + 1) >> 1;
+  int dst_halfwidth = (dst_width + 1) >> 1;
+  int dst_halfheight = (dst_height + 1) >> 1;
+  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;

  ScalePlane(src_y, src_stride_y, src_width, src_height,
             dst_y, dst_stride_y, dst_width, dst_height,
             filtering, use_reference_impl_);
-  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
-             dst_u, dst_stride_u, halfdst_width, halfoheight,
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
             filtering, use_reference_impl_);
-  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
-             dst_v, dst_stride_v, halfdst_width, halfoheight,
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
             filtering, use_reference_impl_);
+  }
  return 0;
 }

-int Scale_1(const uint8* src, int src_width, int src_height,
-          uint8* dst, int dst_width, int dst_height, int ooffset,
+// Deprecated api
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
          int interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
-      ooffset >= dst_height) {
+      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
+      dst_yoffset >= dst_height) {
    return -1;
  }
-  ooffset = ooffset & ~1;  // chroma requires offset to multiple of 2.
+  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
  {
-    int halfsrc_width = (src_width + 1) >> 1;
-    int halfsrc_height = (src_height + 1) >> 1;
-    int halfdst_width = (dst_width + 1) >> 1;
-    int halfoheight = (dst_height + 1) >> 1;
-    int aheight = dst_height - ooffset * 2;  // actual output height
-    const uint8* const iyptr = src;
-    uint8* oyptr = dst + ooffset * dst_width;
-    const uint8* const iuptr = src + src_width * src_height;
-    uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
-    const uint8* const ivptr = src + src_width * src_height +
-                               halfsrc_width * halfsrc_height;
-    uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight +
-                   (ooffset >> 1) * halfdst_width;
-    return Scale_2(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
-                 src_width, src_height, oyptr, ouptr, ovptr, dst_width,
-                 halfdst_width, halfdst_width, dst_width, aheight, interpolate);
+  int src_halfwidth = (src_width + 1) >> 1;
+  int src_halfheight = (src_height + 1) >> 1;
+  int dst_halfwidth = (dst_width + 1) >> 1;
+  int dst_halfheight = (dst_height + 1) >> 1;
+  int aheight = dst_height - dst_yoffset * 2;  // actual output height
+  const uint8* const src_y = src;
+  const uint8* const src_u = src + src_width * src_height;
+  const uint8* const src_v = src + src_width * src_height +
+                             src_halfwidth * src_halfheight;
+  uint8* dst_y = dst + dst_yoffset * dst_width;
+  uint8* dst_u = dst + dst_width * dst_height +
+                 (dst_yoffset >> 1) * dst_halfwidth;
+  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+                 (dst_yoffset >> 1) * dst_halfwidth;
+  return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
+               src_width, src_height, dst_y, dst_u, dst_v, dst_width,
+               dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
  }
 }

-//}  // namespace libyuv
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/vp8_multi_resolution_encoder.c
+++ b/vp8_multi_resolution_encoder.c
@ -78,6 +78,8 @@ static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
    exit(EXIT_FAILURE);
 }

+int (*read_frame_p)(FILE *f, vpx_image_t *img);  /* frame reader; main() points it at read_frame or read_frame_by_row and calls it once per input frame */
+
 static int read_frame(FILE *f, vpx_image_t *img) {
    size_t nbytes, to_read;
    int    res = 1;
@ -92,6 +94,55 @@ static int read_frame(FILE *f, vpx_image_t *img) {
    return res;
 }

+static int read_frame_by_row(FILE *f, vpx_image_t *img) {  /* Read one frame from f into img, one row at a time. */
+    size_t nbytes, to_read;  /* bytes actually read / bytes requested for the current row */
+    int    res = 1;  /* return value: stays 1 if every row is read in full, set to 0 on a short read */
+    int plane;  /* loop index, counted in Y,U,V order */
+
+    for (plane = 0; plane < 3; plane++)
+    {
+        unsigned char *ptr;  /* write position inside the destination plane */
+        int w = (plane ? (1 + img->d_w) / 2 : img->d_w);  /* row length in bytes: full d_w for plane 0, half (rounded up) otherwise */
+        int h = (plane ? (1 + img->d_h) / 2 : img->d_h);  /* row count: full d_h for plane 0, half (rounded up) otherwise */
+        int r;  /* row index within the current plane */
+
+        /* Determine the correct plane based on the image format. The for-loop
+         * always counts in Y,U,V order, but this may not match the order of
+         * the data on disk.
+         */
+        switch (plane)
+        {
+        case 1:
+            ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];  /* second plane in the file: V for YV12, U otherwise */
+            break;
+        case 2:
+            ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];  /* third plane in the file: U for YV12, V otherwise */
+            break;
+        default:
+            ptr = img->planes[plane];  /* plane 0 is used as-is */
+        }
+
+        for (r = 0; r < h; r++)
+        {
+            to_read = w;  /* request exactly one row of w bytes */
+
+            nbytes = fread(ptr, 1, to_read, f);
+            if(nbytes != to_read) {  /* short read: stop and report failure */
+                res = 0;
+                if(nbytes > 0)  /* warn only when part of a row arrived; a zero-byte read is silent (presumably a normal end of file) */
+                    printf("Warning: Read partial frame. Check your width & height!\n");
+                break;
+            }
+
+            ptr += img->stride[plane];  /* step to the next row by the stride, not by w. NOTE(review): indexed by the loop counter, not the swapped YV12 plane -- assumes U and V strides are equal, confirm */
+        }
+        if (!res)  /* a row failed: leave the plane loop as well */
+            break;
+    }
+
+    return res;
+}
+
 static void write_ivf_file_header(FILE *outfile,
                                  const vpx_codec_enc_cfg_t *cfg,
                                  int frame_cnt) {
@ -262,9 +313,14 @@ int main(int argc, char **argv)

    /* Allocate image for each encoder */
    for (i=0; i< NUM_ENCODERS; i++)
-        if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 1))
+        if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
            die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);

+    if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+        read_frame_p = read_frame;
+    else
+        read_frame_p = read_frame_by_row;
+
    for (i=0; i< NUM_ENCODERS; i++)
        write_ivf_file_header(outfile[i], &cfg[i], 0);

@ -305,35 +361,22 @@ int main(int argc, char **argv)
        const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];

        flags = 0;
-        frame_avail = read_frame(infile, &raw[0]);
+        frame_avail = read_frame_p(infile, &raw[0]);

-        for ( i=1; i<NUM_ENCODERS; i++)
-        {
        if(frame_avail)
+        {
+            for ( i=1; i<NUM_ENCODERS; i++)
            {
                /*Scale the image down a number of times by downsampling factor*/
-                int src_uvwidth = (raw[i-1].d_w + 1) >> 1;
-                int src_uvheight = (raw[i-1].d_h + 1) >> 1;
-                const unsigned char* src_y = raw[i-1].planes[VPX_PLANE_Y];
-                const unsigned char* src_u = raw[i-1].planes[VPX_PLANE_Y]
-                                             + raw[i-1].d_w*raw[i-1].d_h;
-                const unsigned char* src_v = raw[i-1].planes[VPX_PLANE_Y]
-                                             + raw[i-1].d_w*raw[i-1].d_h
-                                             + src_uvwidth*src_uvheight;
-                int dst_uvwidth = (raw[i].d_w + 1) >> 1;
-                int dst_uvheight = (raw[i].d_h + 1) >> 1;
-                unsigned char* dst_y = raw[i].planes[VPX_PLANE_Y];
-                unsigned char* dst_u = raw[i].planes[VPX_PLANE_Y]
-                                       + raw[i].d_w*raw[i].d_h;
-                unsigned char* dst_v = raw[i].planes[VPX_PLANE_Y]
-                                       + raw[i].d_w*raw[i].d_h
-                                       + dst_uvwidth*dst_uvheight;
-
                /* FilterMode 1 or 2 give better psnr than FilterMode 0. */
-                I420Scale(src_y, raw[i-1].d_w, src_u, src_uvwidth, src_v,
-                          src_uvwidth, raw[i-1].d_w, raw[i-1].d_h,
-                          dst_y, raw[i].d_w, dst_u, dst_uvwidth,
-                          dst_v, dst_uvwidth, raw[i].d_w, raw[i].d_h, 1);
+                I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
+                          raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
+                          raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
+                          raw[i-1].d_w, raw[i-1].d_h,
+                          raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+                          raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+                          raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+                          raw[i].d_w, raw[i].d_h, 1);
            }
        }

--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@ -243,6 +243,7 @@ vpx_codec_err_t  vpx_codec_encode(vpx_codec_ctx_t            *ctx,
                ctx--;
                if (img) img--;
            }
+            ctx++;
        }

        FLOATING_POINT_RESTORE();