From aa7335e610b961626f77130bc99b24de1031601d Mon Sep 17 00:00:00 2001 From: Yunqing Wang <yunqingwang@google.com> Date: Tue, 25 Oct 2011 15:14:16 -0400 Subject: [PATCH] Multiple-resolution encoder The example encoder down-samples the input video frames a number of times with a down-sampling factor, and then encodes and outputs bitstreams with different resolutions. Support arbitrary down-sampling factor, and down-sampling factor can be different for each encoding level. For example, the encoder can be tested as follows. 1. Configure with multi-resolution encoding enabled: ../libvpx/configure --target=x86-linux-gcc --disable-codecs --enable-vp8 --enable-runtime_cpu_detect --enable-debug --disable-install-docs --enable-error-concealment --enable-multi-res-encoding 2. Run make 3. Encode: If input video is 1280x720, run: ./vp8_multi_resolution_encoder 1280 720 input.yuv 1.ivf 2.ivf 3.ivf 1 (output: 1.ivf(1280x720); 2.ivf(640x360); 3.ivf(320x180). The last parameter is set to 1/0 to show/not show PSNR.) 4. Decode: ./simple_decoder 1.ivf 1.yuv ./simple_decoder 2.ivf 2.yuv ./simple_decoder 3.ivf 3.yuv 5. View video: mplayer 1.yuv -demuxer rawvideo -rawvideo w=1280:h=720 -loop 0 -fps 30 mplayer 2.yuv -demuxer rawvideo -rawvideo w=640:h=360 -loop 0 -fps 30 mplayer 3.yuv -demuxer rawvideo -rawvideo w=320:h=180 -loop 0 -fps 30 The encoding parameters can be modified in vp8_multi_resolution_encoder.c, for example, target bitrate, frame rate... Modified API. John helped a lot with that. Thanks! Change-Id: I03be9a51167eddf94399f92d269599fb3f3d54f5 --- configure | 5 +- examples.mk | 10 + third_party/libyuv/README.webm | 17 + .../libyuv/include/libyuv/basic_types.h | 68 + third_party/libyuv/include/libyuv/cpu_id.h | 31 + third_party/libyuv/include/libyuv/scale.h | 67 + third_party/libyuv/source/cpu_id.c | 74 + third_party/libyuv/source/row.h | 258 ++ third_party/libyuv/source/scale.c | 3914 +++++++++++++++++ usage.dox | 1 + usage_cx.dox | 3 +- vp8/common/blockd.h | 12 + vp8/common/onyx.h | 14 + vp8/encoder/encodeframe.c | 30 +- vp8/encoder/mcomp.c | 24 +- vp8/encoder/mr_dissim.c | 201 + vp8/encoder/mr_dissim.h | 19 + vp8/encoder/onyx_if.c | 29 +- vp8/encoder/onyx_int.h | 8 + vp8/encoder/pickinter.c | 573 +++ vp8/encoder/pickinter.h | 12 +- vp8/encoder/rdopt.c | 51 - vp8/encoder/rdopt.h | 51 + vp8/vp8_cx_iface.c | 60 +- vp8/vp8_dx_iface.c | 6 +- vp8/vp8cx.mk | 2 + vp8_multi_resolution_encoder.c | 420 ++ vpx/internal/vpx_codec_internal.h | 25 +- vpx/src/vpx_decoder.c | 2 +- vpx/src/vpx_encoder.c | 114 +- vpx/vpx_encoder.h | 44 +- 31 files changed, 6057 insertions(+), 88 deletions(-) create mode 100644 third_party/libyuv/README.webm create mode 100644 third_party/libyuv/include/libyuv/basic_types.h create mode 100644 third_party/libyuv/include/libyuv/cpu_id.h create mode 100644 third_party/libyuv/include/libyuv/scale.h create mode 100644 third_party/libyuv/source/cpu_id.c create mode 100644 third_party/libyuv/source/row.h create mode 100644 third_party/libyuv/source/scale.c create mode 100644 vp8/encoder/mr_dissim.c create mode 100644 vp8/encoder/mr_dissim.h create mode 100644 vp8_multi_resolution_encoder.c diff --git a/configure b/configure index cca94a24c..363687a11 100755 --- a/configure +++ b/configure @@ -35,7 +35,7 @@ Advanced options: ${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders) ${toggle_mem_tracker} track memory usage ${toggle_postproc} postprocessing - ${toggle_multithread} multithreaded encoding and decoding. 
+ ${toggle_multithread} multithreaded encoding and decoding ${toggle_spatial_resampling} spatial sampling (scaling) support ${toggle_realtime_only} enable this option while building for real-time encoding ${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses @@ -44,6 +44,7 @@ Advanced options: ${toggle_static} static library support ${toggle_small} favor smaller size over speed ${toggle_postproc_visualizer} macro block / block level visualizers + ${toggle_multi_res_encoding} enable multiple-resolution encoding Codecs: Codecs can be selectively enabled or disabled individually, or by family: @@ -262,6 +263,7 @@ CONFIG_LIST=" postproc_visualizer os_support unit_tests + multi_res_encoding " CMDLINE_SELECT=" extra_warnings @@ -304,6 +306,7 @@ CMDLINE_SELECT=" small postproc_visualizer unit_tests + multi_res_encoding " process_cmdline() { diff --git a/examples.mk b/examples.mk index 8088d3217..1f7dcc171 100644 --- a/examples.mk +++ b/examples.mk @@ -96,6 +96,16 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame +# C file is provided, not generated automatically. +GEN_EXAMPLES-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS \ + += third_party/libyuv/include/libyuv/basic_types.h \ + third_party/libyuv/include/libyuv/cpu_id.h \ + third_party/libyuv/include/libyuv/scale.h \ + third_party/libyuv/source/scale.c \ + third_party/libyuv/source/cpu_id.c +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding # Handle extra library flags depending on codec configuration diff --git a/third_party/libyuv/README.webm b/third_party/libyuv/README.webm new file mode 100644 index 000000000..32766be27 --- /dev/null +++ b/third_party/libyuv/README.webm @@ -0,0 +1,17 @@ +Name: libyuv +URL: http://code.google.com/p/libyuv/ +Version: 90 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in multiple resolution encoder example, +which down-samples the original input video (f.g. 1280x720) a number of times +in order to encode multiple resolution bit streams. + +Local Modifications: +Modified the original scaler code from C++ to C to fit in our current build +system. This is a temporal solution, and will be improved later. \ No newline at end of file diff --git a/third_party/libyuv/include/libyuv/basic_types.h b/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..87f8bd2de --- /dev/null +++ b/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#ifndef WIN32 +#include <stdint.h> // for uintptr_t +#endif + +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef __int64 int64; +#else +typedef long long int64; +#endif /* COMPILER_MSVC */ +typedef int int32; +typedef short int16; +typedef char int8; + +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else +typedef unsigned long long uint64; +//typedef long long int64; +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif /* COMPILER_MSVC */ +typedef unsigned int uint32; +typedef unsigned short uint16; +typedef unsigned char uint8; +#endif // INT_TYPES_DEFINED + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif + +#define IS_ALIGNED(p, a) (0==((uintptr_t)(p) & ((a)-1))) +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + \ + ((t)-1)) & ~((t)-1)))) + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..8ebafe9b5 --- /dev/null +++ b/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ +#define INCLUDE_LIBYUV_CPU_ID_H_ + +//namespace libyuv { + +// These flags are only valid on x86 processors +static const int kCpuHasSSE2 = 1; +static const int kCpuHasSSSE3 = 2; + +// SIMD support on ARM processors +static const int kCpuHasNEON = 4; + +// Detect CPU has SSE2 etc. +int TestCpuFlag(int flag); + +// For testing, allow CPU flags to be disabled. +void MaskCpuFlagsForTest(int enable_flags); + +//} // namespace libyuv + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..5b2d364ad --- /dev/null +++ b/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_H_ +#define INCLUDE_LIBYUV_SCALE_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +//namespace libyuv { + +// Supported filtering +typedef enum { + kFilterNone = 0, // Point sample; Fastest + kFilterBilinear = 1, // Faster than box, but lower quality scaling down. 
+ kFilterBox = 2 // Highest quality +}FilterMode; + +// Scales a YUV 4:2:0 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering); + +// Legacy API +// If dst_height_offset is non-zero, the image is offset by that many pixels +// and stretched to (dst_height - dst_height_offset * 2) pixels high, +// instead of dst_height. +int Scale_1(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_height_offset, + int interpolate); + +// Same, but specified src terms of each plane location and stride. +int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate); + +// For testing, allow disabling of optimizations. +void SetUseReferenceImpl(int use); + +//} // namespace libyuv + +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/third_party/libyuv/source/cpu_id.c b/third_party/libyuv/source/cpu_id.c new file mode 100644 index 000000000..e3b66f21d --- /dev/null +++ b/third_party/libyuv/source/cpu_id.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86 + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. +#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void __cpuid(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type) + ); +} +#endif + +//namespace libyuv { + +// CPU detect function for SIMD instruction sets. +static int cpu_info_initialized_ = 0; +static int cpu_info_ = 0; + +// Global lock for cpu initialization. +static void InitCpuFlags() { +#ifdef CPU_X86 + int cpu_info[4]; + __cpuid(cpu_info, 1); + cpu_info_ = (cpu_info[2] & 0x00000200 ? 
kCpuHasSSSE3 : 0) | + (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0); +#elif defined(__ARM_NEON__) + // gcc -mfpu=neon defines __ARM_NEON__ + // if code is specifically built for Neon-only, enable the flag. + cpu_info_ |= kCpuHasNEON; +#else + cpu_info_ = 0; +#endif + cpu_info_initialized_ = 1; +} + +void MaskCpuFlagsForTest(int enable_flags) { + InitCpuFlags(); + cpu_info_ &= enable_flags; +} + +int TestCpuFlag(int flag) { + if (!cpu_info_initialized_) { + InitCpuFlags(); + } + return cpu_info_ & flag ? 1 : 0; +} + +//} // namespace libyuv diff --git a/third_party/libyuv/source/row.h b/third_party/libyuv/source/row.h new file mode 100644 index 000000000..0486fe23a --- /dev/null +++ b/third_party/libyuv/source/row.h @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LIBYUV_SOURCE_ROW_H_ +#define LIBYUV_SOURCE_ROW_H_ + +#include "third_party/libyuv/include/libyuv/basic_types.h" + +#define kMaxStride (2048 * 4) +//#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) +#define YUV_DISABLE_ASM +#endif + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_FASTCONVERTYUVTOARGBROW_NEON +void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOBGRAROW_NEON +void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#define HAS_FASTCONVERTYUVTOABGRROW_NEON +void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#endif + +// The following are available on all x86 platforms +#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(YUV_DISABLE_ASM) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_FASTCONVERTYTOARGBROW_SSE2 +#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 +#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 +#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_REVERSE_ROW_SSSE3 +#endif + +// The following are available on Neon platforms +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_REVERSE_ROW_NEON +#endif + +//extern "C" { + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, 
int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#ifdef HAS_REVERSE_ROW_SSSE3 +void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); +#endif +#ifdef HAS_REVERSE_ROW_NEON +void ReverseRow_NEON(const uint8* src, uint8* dst, int width); +#endif +void ReverseRow_C(const uint8* src, uint8* dst, int width); + +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +#endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) signed char vec8[16]; +typedef __declspec(align(16)) unsigned char uvec8[16]; +typedef __declspec(align(16)) signed short vec16[8]; +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; +typedef signed short __attribute__((vector_size(16))) vec16; +#endif + +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +//extern "C" +SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); + +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + 
int width); + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 +void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +#endif + +#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); + +#endif + +//} // extern "C" + +#endif // LIBYUV_SOURCE_ROW_H_ diff --git a/third_party/libyuv/source/scale.c b/third_party/libyuv/source/scale.c new file mode 100644 index 000000000..02ffdac65 --- /dev/null +++ b/third_party/libyuv/source/scale.c @@ -0,0 +1,3914 @@ +/* + * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/libyuv/include/libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "third_party/libyuv/include/libyuv/cpu_id.h" +#include "third_party/libyuv/source/row.h" +#if defined(_MSC_VER) +#define ALIGN16(var) __declspec(align(16)) var +#else +#define ALIGN16(var) var __attribute__((aligned(16))) +#endif + +// Note: A Neon reference manual +// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html +// Note: Some SSE2 reference manuals +// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf + +//namespace libyuv { + +// Set the following flag to true to revert to only +// using the reference implementation ScalePlaneBox(), and +// NOT the optimized versions. Useful for debugging and +// when comparing the quality of the resulting YUV planes +// as produced by the optimized and non-optimized versions. 
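
/*
 * Illustrative sketch (not part of this patch): how a caller might combine
 * the two public entry points declared in libyuv/scale.h. SetUseReferenceImpl()
 * toggles the flag defined below, and I420Scale() then scales one I420 frame.
 * The helper name scale_720p_to_360p and the stride values are assumptions
 * for the 1280x720 -> 640x360 step used by the multi-resolution example.
 */
#include "third_party/libyuv/include/libyuv/scale.h"

static int scale_720p_to_360p(const uint8* src_y, const uint8* src_u,
                              const uint8* src_v,
                              uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  SetUseReferenceImpl(0);  /* 0 keeps the optimized code paths enabled */
  return I420Scale(src_y, 1280, src_u, 640, src_v, 640,  /* source planes   */
                   1280, 720,                            /* source size     */
                   dst_y, 640, dst_u, 320, dst_v, 320,   /* dest planes     */
                   640, 360,                             /* dest size       */
                   kFilterBox);                          /* highest quality */
}
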
+ +static int use_reference_impl_ = 0; + +void SetUseReferenceImpl(int use) { + use_reference_impl_ = use; +} + +// TODO: The preprocessor definitions for Win64 are not right in build system. +// Disable optimized code for now. +#define YUV_DISABLE_ASM + +/** + * NEON downscalers with interpolation. + * + * Provided by Fritz Koenig + * + */ + +#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#define HAS_SCALEROWDOWN2_NEON +void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "add %1, %0 \n" // change the stride to row 2 pointer + "1: \n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define HAS_SCALEROWDOWN4_NEON +static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" + + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + + "vpaddl.u16 q0, q0 \n" + + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" + + "subs %2, #4 \n" + "bhi 1b \n" + + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(src_stride) // %3 + : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN34_NEON +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order needs to be d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! 
\n" + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.u8 {d0, d1, d2}, [%1]! \n" + + "subs %2, #24 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +const uint8 shuf38[16] __attribute__ ((aligned(16))) = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +const uint8 shuf38_2[16] __attribute__ ((aligned(16))) = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +static void ScaleRowDown38_NEON(const uint8* src_ptr, int, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! 
\n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(shuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2), // %5 + "r"(mult38_div9) // %6 + : "r4", "q0", "q1", "q2", "q3", "q8", "q9", + "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(mult38_div6), // %4 + "r"(shuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +/** + * SSE2 downscalers with interpolation. + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ + +// Constants for SSE2 code +#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \ + !defined(YUV_DISABLE_ASM) +#if defined(_MSC_VER) +#define TALIGN16(t, var) __declspec(align(16)) t _ ## var +#elif defined(OSX) && defined(__i386__) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +// Offsets for source bytes 0 to 9 +TALIGN16(const uint8, shuf0[16]) = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +TALIGN16(const uint8, shuf1[16]) = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +TALIGN16(const uint8, shuf2[16]) = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +TALIGN16(const uint8, shuf01[16]) = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +TALIGN16(const uint8, shuf11[16]) = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+TALIGN16(const uint8, shuf21[16]) = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +TALIGN16(const uint8, madd01[16]) = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +TALIGN16(const uint8, madd11[16]) = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +TALIGN16(const uint8, madd21[16]) = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +TALIGN16(const int16, round34[8]) = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +TALIGN16(const uint8, shuf38a[16]) = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +TALIGN16(const uint8, shuf38b[16]) = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +TALIGN16(const uint8, shufac0[16]) = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +TALIGN16(const uint8, shufac3[16]) = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +TALIGN16(const uint16, scaleac3[8]) = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab0[16]) = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab1[16]) = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +TALIGN16(const uint8, shufab2[16]) = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +TALIGN16(const uint16, scaleab2[8]) = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +#endif + +#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) + +#define HAS_SCALEROWDOWN2_SSE2 +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + ret + } +} +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
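
/*
 * For reference, a scalar sketch of the 32x2 -> 16x1 blend: each output
 * pixel is the rounded average of a 2x2 block of input pixels, as in the
 * NEON ScaleRowDown2Int above. (The pavgb-based code below may round
 * intermediate averages slightly differently.) The _C_sketch name is
 * illustrative, not a function this patch defines; uint8 is the libyuv
 * typedef from basic_types.h.
 */
static void ScaleRowDown2Int_C_sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = src_ptr[2 * x] + src_ptr[2 * x + 1] +
              src_ptr[2 * x + src_stride] + src_ptr[2 * x + src_stride + 1];
    dst_ptr[x] = (uint8)((sum + 2) >> 2);  /* divide by 4 with rounding */
  }
}
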
+__declspec(naked) +void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + + pop esi + ret + } +} + +#define HAS_SCALEROWDOWN4_SSE2 +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + lea edx, [ebx + ebx * 2] // src_stride * 3 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 8 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN8_SSE2 +// Point samples 32 pixels to 4 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
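
/*
 * Point sampling simply keeps every 8th source pixel; a scalar sketch
 * (illustrative helper name, not defined by this patch):
 */
static void ScaleRowDown8_C_sketch(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  /* ignored, as in the SSE2 version below */
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[8 * x];
  }
}
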
+__declspec(naked) +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes + psrlq xmm5, 56 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 // 32->16 + packuswb xmm0, xmm0 // 16->8 + packuswb xmm0, xmm0 // 8->4 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +// Blends 32x8 rectangle to 4x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. +__declspec(naked) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + lea edx, [ebx + ebx * 2] // src_stride * 3 + pxor xmm7, xmm7 + + wloop: + movdqa xmm0, [esi] // average 8 rows to 1 + movdqa xmm1, [esi + 16] + movdqa xmm2, [esi + ebx] + movdqa xmm3, [esi + ebx + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqa xmm2, [esi + ebx * 2] + movdqa xmm3, [esi + ebx * 2 + 16] + movdqa xmm4, [esi + edx] + movdqa xmm5, [esi + edx + 16] + lea ebp, [esi + ebx * 4] + lea esi, [esi + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, [ebp] + movdqa xmm3, [ebp + 16] + movdqa xmm4, [ebp + ebx] + movdqa xmm5, [ebp + ebx + 16] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + movdqa xmm4, [ebp + ebx * 2] + movdqa xmm5, [ebp + ebx * 2 + 16] + movdqa xmm6, [ebp + edx] + pavgb xmm4, xmm6 + movdqa xmm6, [ebp + edx + 16] + pavgb xmm5, xmm6 + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + psadbw xmm0, xmm7 // average 32 pixels to 4 + psadbw xmm1, xmm7 + pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 + pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx + por xmm0, xmm1 // -> 3201 + psrlw xmm0, 3 + packuswb xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + + lea edi, [edi + 4] + sub ecx, 4 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN34_SSSE3 +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + // src_stride ignored + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm3, _shuf0 + movdqa xmm4, _shuf1 + movdqa xmm5, _shuf2 + + wloop: + movdqa xmm0, [esi] + movdqa xmm1, [esi + 16] + lea esi, [esi + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edi], xmm0 + movq qword ptr [edi + 8], xmm1 + movq qword ptr [edi + 16], xmm2 + lea edi, [edi + 24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. 
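
/*
 * A scalar sketch of the 4 -> 3 column filter used by the 3/4 scalers.
 * For this "_1_Int" variant the two source rows are first averaged, then
 * each group of four pixels s0..s3 yields three outputs with rounding
 * (the same a0/a1/a2 formulas noted in the NEON code above); the "_0_Int"
 * variant instead weights the rows 3:1. Illustrative helper name, not
 * defined by this patch.
 */
static void ScaleRowDown34_1_Int_C_sketch(const uint8* src_ptr, int src_stride,
                                          uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    const uint8* s = src_ptr + (x / 3) * 4;        /* four source pixels...  */
    int s0 = (s[0] + s[src_stride + 0] + 1) >> 1;  /* ...averaged row-wise   */
    int s1 = (s[1] + s[src_stride + 1] + 1) >> 1;
    int s2 = (s[2] + s[src_stride + 2] + 1) >> 1;
    int s3 = (s[3] + s[src_stride + 3] + 1) >> 1;
    dst_ptr[x + 0] = (uint8)((3 * s0 + s1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((s1 + s2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8)((s2 + 3 * s3 + 2) >> 2);
  }
}
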
+ +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 round34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov ebx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _round34 + + wloop: + movdqa xmm0, [esi] // pixels 0..7 + movdqa xmm1, [esi+ebx] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi], xmm0 + movdqu xmm0, [esi+8] // pixels 8..15 + movdqu xmm1, [esi+ebx+8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+8], xmm0 + movdqa xmm0, [esi+16] // pixels 16..23 + movdqa xmm1, [esi+ebx+16] + lea esi, [esi+32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, _madd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edi+16], xmm0 + lea edi, [edi+24] + sub ecx, 24 + ja wloop + + popad + ret + } +} + +#define HAS_SCALEROWDOWN38_SSSE3 +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shuf38a + movdqa xmm5, _shuf38b + + xloop: + movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 + lea esi, [esi + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq 
qword ptr [edi], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edi + 8], xmm1 + lea edi, [edi + 12] + sub ecx, 12 + ja xloop + + popad + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufac0 + movdqa xmm5, _shufac3 + movdqa xmm6, _scaleac3 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 + movdqa xmm2, [esi + edx] + movhlps xmm1, xmm0 + movhlps xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm1, xmm7 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + movdqa xmm2, [esi + edx * 2] + lea esi, [esi + 16] + movhlps xmm3, xmm2 + punpcklbw xmm2, xmm7 + punpcklbw xmm3, xmm7 + paddusw xmm0, xmm2 + paddusw xmm1, xmm3 + + movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + psrldq xmm0, 2 + paddusw xmm2, xmm0 + pshufb xmm2, xmm4 + + movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + psrldq xmm1, 2 + paddusw xmm3, xmm1 + pshufb xmm3, xmm5 + paddusw xmm2, xmm3 + + pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 + packuswb xmm2, xmm2 + + movd [edi], xmm2 // write 6 pixels + pextrw eax, xmm2, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + movdqa xmm4, _shufab0 + movdqa xmm5, _shufab1 + movdqa xmm6, _shufab2 + movdqa xmm7, _scaleab2 + + xloop: + movdqa xmm2, [esi] // average 2 rows into xmm2 + pavgb xmm2, [esi + edx] + lea esi, [esi + 16] + + movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 + pshufb xmm0, xmm4 + movdqa xmm1, xmm2 + pshufb xmm1, xmm5 + paddusw xmm0, xmm1 + pshufb xmm2, xmm6 + paddusw xmm0, xmm2 + + pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 + packuswb xmm0, xmm0 + + movd [edi], xmm0 // write 6 pixels + pextrw eax, xmm0, 2 + mov [edi + 4], ax + lea edi, [edi + 6] + sub ecx, 6 + ja xloop + + popad + ret + } +} + +#define HAS_SCALEADDROWS_SSE2 + +// Reads 8xN bytes and produces 16 shorts at a time. 
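
/*
 * Scalar sketch of ScaleAddRows: each output word is the sum of one column
 * of src_height input bytes. (The SSE2 code below accumulates with
 * saturating adds; for src_height <= 257 the true sums already fit in
 * 16 bits.) Illustrative helper name, not defined by this patch.
 */
static void ScaleAddRows_C_sketch(const uint8* src_ptr, int src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    int sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = (uint16)sum;
  }
}
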
+__declspec(naked) +static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + pushad + mov esi, [esp + 32 + 4] // src_ptr + mov edx, [esp + 32 + 8] // src_stride + mov edi, [esp + 32 + 12] // dst_ptr + mov ecx, [esp + 32 + 16] // dst_width + mov ebx, [esp + 32 + 20] // height + pxor xmm5, xmm5 + dec ebx + + xloop: + // first row + movdqa xmm2, [esi] + lea eax, [esi + edx] + movhlps xmm3, xmm2 + mov ebp, ebx + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + // sum remaining rows + yloop: + movdqa xmm0, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movhlps xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + paddusw xmm2, xmm0 // sum 16 words + paddusw xmm3, xmm1 + sub ebp, 1 + ja yloop + + movdqa [edi], xmm2 + movdqa [edi + 16], xmm3 + lea edi, [edi + 32] + lea esi, [esi + 16] + + sub ecx, 16 + ja xloop + + popad + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. +#define HAS_SCALEFILTERROWS_SSE2 +__declspec(naked) +static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + movd xmm6, eax // xmm6 = y fraction + punpcklwd xmm6, xmm6 + pshufd xmm6, xmm6, 0 + neg eax // xmm5 = 256 - y fraction + add eax, 256 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + pxor xmm7, xmm7 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm7 + punpcklbw xmm2, xmm7 + punpckhbw xmm1, xmm7 + punpckhbw xmm3, xmm7 + pmullw xmm0, xmm5 // scale row 0 + pmullw xmm1, xmm5 + pmullw xmm2, xmm6 // scale row 1 + pmullw xmm3, xmm6 + paddusw xmm0, xmm2 // sum rows + paddusw xmm1, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + } +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
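
/*
 * Scalar sketch of the bilinear row blend computed by the SSE2 version
 * above and the SSSE3 version below: each output pixel mixes the two
 * source rows by source_y_fraction/256 (fractions 0 and 128 take the
 * copy and pavgb fast paths). The SIMD versions additionally replicate the
 * last output pixel one slot past the end, which this sketch omits.
 * Illustrative helper name, not defined by this patch.
 */
static void ScaleFilterRows_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int src_stride, int dst_width,
                                     int source_y_fraction) {
  int x;
  int y1 = source_y_fraction;        /* weight of the second row */
  int y0 = 256 - source_y_fraction;  /* weight of the first row  */
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 +
                          src_ptr[x + src_stride] * y1) >> 8);
  }
}
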
+#define HAS_SCALEFILTERROWS_SSSE3 +__declspec(naked) +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + cmp eax, 0 + je xloop1 + cmp eax, 128 + je xloop2 + + shr eax, 1 + mov ah,al + neg al + add al, 128 + movd xmm5, eax + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop1: + movdqa xmm0, [esi] + lea esi, [esi + 16] + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop1 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + xloop2: + movdqa xmm0, [esi] + movdqa xmm2, [esi + edx] + lea esi, [esi + 16] + pavgb xmm0, xmm2 + movdqa [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 16 + ja xloop2 + + mov al, [edi - 1] + mov [edi], al + pop edi + pop esi + ret + + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) +static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + movdqa xmm1, _round34 + movdqa xmm2, _shuf01 + movdqa xmm3, _shuf11 + movdqa xmm4, _shuf21 + movdqa xmm5, _madd01 + movdqa xmm6, _madd11 + movdqa xmm7, _madd21 + + wloop: + movdqa xmm0, [eax] // pixels 0..7 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax+8] // pixels 8..15 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+8], xmm0 + movdqa xmm0, [eax+16] // pixels 16..23 + lea eax, [eax+32] + pshufb xmm0, xmm4 + pmaddubsw xmm0, xmm7 + paddsw xmm0, xmm1 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx+16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + ja wloop + ret + } +} + +#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) + +// GCC versions of row functions are verbatim conversions from Visual C. 
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt +#define HAS_SCALEROWDOWN2_SSE2 +static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc" +); +} + +#define HAS_SCALEROWDOWN4_SSE2 +static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t temp = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(temp) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc" +#if defined(__x86_64__) + , "xmm6", "xmm7" +#endif +); +} + +#define HAS_SCALEROWDOWN8_SSE2 +static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, 
int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +); +} + +#if defined(__i386__) +void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown8Int_SSE2 \n" +"_ScaleRowDown8Int_SSE2: \n" +#else + ".global ScaleRowDown8Int_SSE2 \n" +"ScaleRowDown8Int_SSE2: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "lea (%ebx,%ebx,2),%edx \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "movdqa (%esi,%ebx,1),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa (%esi,%ebx,2),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" + "movdqa (%esi,%edx,1),%xmm4 \n" + "movdqa 0x10(%esi,%edx,1),%xmm5 \n" + "lea (%esi,%ebx,4),%ebp \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa 0x0(%ebp),%xmm2 \n" + "movdqa 0x10(%ebp),%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" + "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm4 \n" + "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "psadbw %xmm7,%xmm0 \n" + "psadbw %xmm7,%xmm1 \n" + "pshufd $0xd8,%xmm0,%xmm0 \n" + "pshufd $0x8d,%xmm1,%xmm1 \n" + "por %xmm1,%xmm0 \n" + "psrlw $0x3,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "lea 0x4(%edi),%edi \n" + "sub $0x4,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// fpic is used for magiccam plugin +#if !defined(__PIC__) +#define HAS_SCALEROWDOWN34_SSSE3 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_SSSE3 \n" +"_ScaleRowDown34_SSSE3: \n" +#else + ".global ScaleRowDown34_SSSE3 \n" +"ScaleRowDown34_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf0,%xmm3 \n" + "movdqa _shuf1,%xmm4 \n" + "movdqa _shuf2,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm2 \n" + "lea 0x20(%esi),%esi \n" + "movdqa %xmm2,%xmm1 \n" + "palignr $0x8,%xmm0,%xmm1 \n" + "pshufb %xmm3,%xmm0 \n" + "pshufb %xmm4,%xmm1 \n" + "pshufb %xmm5,%xmm2 \n" + "movq %xmm0,(%edi) \n" + "movq %xmm1,0x8(%edi) \n" + "movq %xmm2,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_1_Int_SSSE3 \n" +"_ScaleRowDown34_1_Int_SSSE3: \n" +#else + ".global ScaleRowDown34_1_Int_SSSE3 \n" +"ScaleRowDown34_1_Int_SSSE3: \n" +#endif + 
"pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + + "popa \n" + "ret \n" +); + +void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown34_0_Int_SSSE3 \n" +"_ScaleRowDown34_0_Int_SSSE3: \n" +#else + ".global ScaleRowDown34_0_Int_SSSE3 \n" +"ScaleRowDown34_0_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +#define HAS_SCALEROWDOWN38_SSSE3 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_SSSE3 \n" +"_ScaleRowDown38_SSSE3: \n" +#else + ".global ScaleRowDown38_SSSE3 \n" +"ScaleRowDown38_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf38a ,%xmm4 \n" + "movdqa _shuf38b ,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pshufb %xmm4,%xmm0 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusb %xmm1,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movhlps %xmm0,%xmm1 \n" + "movd %xmm1,0x8(%edi) \n" + "lea 0xc(%edi),%edi \n" + "sub 
$0xc,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_3_Int_SSSE3 \n" +"_ScaleRowDown38_3_Int_SSSE3: \n" +#else + ".global ScaleRowDown38_3_Int_SSSE3 \n" +"ScaleRowDown38_3_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufac0,%xmm4 \n" + "movdqa _shufac3,%xmm5 \n" + "movdqa _scaleac3,%xmm6 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "movhlps %xmm0,%xmm1 \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm1 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa (%esi,%edx,2),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "pshufb %xmm4,%xmm2 \n" + "movdqa %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "pshufb %xmm5,%xmm3 \n" + "paddusw %xmm3,%xmm2 \n" + "pmulhuw %xmm6,%xmm2 \n" + "packuswb %xmm2,%xmm2 \n" + "movd %xmm2,(%edi) \n" + "pextrw $0x2,%xmm2,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleRowDown38_2_Int_SSSE3 \n" +"_ScaleRowDown38_2_Int_SSSE3: \n" +#else + ".global ScaleRowDown38_2_Int_SSSE3 \n" +"ScaleRowDown38_2_Int_SSSE3: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufab0,%xmm4 \n" + "movdqa _shufab1,%xmm5 \n" + "movdqa _shufab2,%xmm6 \n" + "movdqa _scaleab2,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "pavgb (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm2,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa %xmm2,%xmm1 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusw %xmm1,%xmm0 \n" + "pshufb %xmm6,%xmm2 \n" + "paddusw %xmm2,%xmm0 \n" + "pmulhuw %xmm7,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "pextrw $0x2,%xmm0,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); +#endif // __PIC__ + +#define HAS_SCALEADDROWS_SSE2 +void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleAddRows_SSE2 \n" +"_ScaleAddRows_SSE2: \n" +#else + ".global ScaleAddRows_SSE2 \n" +"ScaleAddRows_SSE2: \n" +#endif + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "mov 0x34(%esp),%ebx \n" + "pxor %xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm2 \n" + "lea (%esi,%edx,1),%eax \n" + "movhlps %xmm2,%xmm3 \n" + "lea -0x1(%ebx),%ebp \n" + "punpcklbw %xmm5,%xmm2 \n" + "punpcklbw %xmm5,%xmm3 \n" + +"2:" + "movdqa (%eax),%xmm0 \n" + "lea (%eax,%edx,1),%eax \n" + "movhlps %xmm0,%xmm1 \n" + "punpcklbw %xmm5,%xmm0 \n" + "punpcklbw %xmm5,%xmm1 \n" + "paddusw 
%xmm0,%xmm2 \n" + "paddusw %xmm1,%xmm3 \n" + "sub $0x1,%ebp \n" + "ja 2b \n" + + "movdqa %xmm2,(%edi) \n" + "movdqa %xmm3,0x10(%edi) \n" + "lea 0x20(%edi),%edi \n" + "lea 0x10(%esi),%esi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSE2 \n" +"_ScaleFilterRows_SSE2: \n" +#else + ".global ScaleFilterRows_SSE2 \n" +"ScaleFilterRows_SSE2: \n" +#endif + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "movd %eax,%xmm6 \n" + "punpcklwd %xmm6,%xmm6 \n" + "pshufd $0x0,%xmm6,%xmm6 \n" + "neg %eax \n" + "add $0x100,%eax \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + "pxor %xmm7,%xmm7 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "movdqa %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpckhbw %xmm7,%xmm1 \n" + "punpckhbw %xmm7,%xmm3 \n" + "pmullw %xmm5,%xmm0 \n" + "pmullw %xmm5,%xmm1 \n" + "pmullw %xmm6,%xmm2 \n" + "pmullw %xmm6,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "psrlw $0x8,%xmm0 \n" + "psrlw $0x8,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction); + asm( + ".text \n" +#if defined(OSX) + ".globl _ScaleFilterRows_SSSE3 \n" +"_ScaleFilterRows_SSSE3: \n" +#else + ".global ScaleFilterRows_SSSE3 \n" +"ScaleFilterRows_SSSE3: \n" +#endif + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "shr %eax \n" + "mov %al,%ah \n" + "neg %al \n" + "add $0x80,%al \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + +"1:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "punpcklbw %xmm2,%xmm0 \n" + "punpckhbw %xmm2,%xmm1 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "pmaddubsw %xmm5,%xmm1 \n" + "psrlw $0x7,%xmm0 \n" + "psrlw $0x7,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"2:" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" + +"3:" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" +); + +#elif defined(__x86_64__) +static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "lea (%3,%3,2),%%r10 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "movdqa 0x10(%0,%3,2),%%xmm3 \n" + "movdqa (%0,%%r10,1),%%xmm4 \n" + "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" + "lea (%0,%3,4),%%r11 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%%r11),%%xmm2 \n" + "movdqa 0x10(%%r11),%%xmm3 \n" + "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd $0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "r10", "r11", "xmm6", "xmm7" +); +} + +#define HAS_SCALEROWDOWN34_SSSE3 +static void ScaleRowDown34_SSSE3(const uint8* 
src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%3),%%xmm3 \n" + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf0), // %3 + "r"(_shuf1), // %4 + "r"(_shuf2) // %5 + : "memory", "cc" +); +} + +static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3,1),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 
\n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shuf01), // %4 + "r"(_shuf11), // %5 + "r"(_shuf21), // %6 + "r"(_madd01), // %7 + "r"(_madd11), // %8 + "r"(_round34), // %9 + "r"(_madd21) // %10 + : "memory", "cc", "xmm6", "xmm7", "xmm8" +); +} + +#define HAS_SCALEROWDOWN38_SSSE3 +static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile( + "movdqa (%3),%%xmm4 \n" + "movdqa (%4),%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(_shuf38a), // %3 + "r"(_shuf38b) // %4 + : "memory", "cc" +); +} + +static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "pxor %%xmm7,%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm1 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "pshufb %%xmm5,%%xmm3 \n" + "paddusw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" + "pextrw $0x2,%%xmm2,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufac0), // %4 + "r"(_shufac3), // %5 + "r"(_scaleac3) // %6 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "movdqa (%7),%%xmm7 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "pavgb (%0,%3,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" + "pshufb %%xmm6,%%xmm2 \n" + "paddusw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "pextrw $0x2,%%xmm0,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"(_shufab0), // %4 + "r"(_shufab1), // %5 + "r"(_shufab2), // %6 + "r"(_scaleab2) // %7 + : "memory", "cc", "rax", "xmm6", "xmm7" +); +} + +#define HAS_SCALEADDROWS_SSE2 +static void ScaleAddRows_SSE2(const 
uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" +"1:" + "movdqa (%0),%%xmm2 \n" + "lea (%0,%4,1),%%r10 \n" + "movhlps %%xmm2,%%xmm3 \n" + "lea -0x1(%3),%%r11 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + +"2:" + "movdqa (%%r10),%%xmm0 \n" + "lea (%%r10,%4,1),%%r10 \n" + "movhlps %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "paddusw %%xmm0,%%xmm2 \n" + "paddusw %%xmm1,%%xmm3 \n" + "sub $0x1,%%r11 \n" + "ja 2b \n" + + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width), // %2 + "+r"(src_height) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "r10", "r11" +); +} + +// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version +#define HAS_SCALEFILTERROWS_SSE2 +static void ScaleFilterRows_SSE2(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile( + "mov %3,%%eax \n" + "movd %%eax,%%xmm6 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "neg %%eax \n" + "add $0x100,%%eax \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm7,%%xmm7 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "punpckhbw %%xmm7,%%xmm3 \n" + "pmullw %%xmm5,%%xmm0 \n" + "pmullw %%xmm5,%%xmm1 \n" + "pmullw %%xmm6,%%xmm2 \n" + "pmullw %%xmm6,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax", "xmm6", "xmm7" + ); + } + return; +} + +// Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version +#define HAS_SCALEFILTERROWS_SSSE3 +static void ScaleFilterRows_SSSE3(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + if (source_y_fraction == 0) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "rax" + ); + return; + } else if (source_y_fraction == 128) { + asm volatile( + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "rax" + ); + return; + } else { + asm volatile( + "mov %3,%%eax \n" + "shr %%eax \n" + "mov %%al,%%ah \n" + "neg %%al \n" + "add $0x80,%%al \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "1:" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "rax" + ); + } + return; +} +#endif +#endif + +// CPU agnostic row functions +static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 2; + } +} + +static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + + src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; + src_ptr += 2; + } +} + +static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 4; + } +} + +static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + + src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + + src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + + src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + + 8) >> 4; + src_ptr += 4; + } +} + +// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down. +// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. +// The following 2 lines cause error on Windows. 
+//static const int kMaxOutputWidth = 640; +//static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2; +#define kMaxOutputWidth 640 +#define kMaxRow12 1280 + +static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + *dst++ = *src_ptr; + src_ptr += 8; + } +} + +// Note calling code checks width is less than max and if not +// uses ScaleRowDown8_C instead. +static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + ALIGN16(uint8 src_row[kMaxRow12 * 2]); + assert(dst_width <= kMaxOutputWidth); + ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); + ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, + src_row + kMaxOutputWidth, + dst_width * 2); + ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); +} + +static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + uint8* dend; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst + dst_width; + do { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } while (dst < dend); +} + +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, + uint8* d, int dst_width) { + uint8* dend; + const uint8* s; + const uint8* t; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = d + dst_width; + s = src_ptr; + t = src_ptr + src_stride; + do { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } while (d < dend); +} + +#if defined(HAS_SCALEFILTERROWS_SSE2) +// Filter row to 3/4 +static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width) { + uint8* dend; + const uint8* s; + assert((dst_width % 3 == 0) && (dst_width > 0)); + dend = dst_ptr + dst_width; + s = src_ptr; + do { + dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; + dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; + dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; + dst_ptr += 3; + s += 4; + } while (dst_ptr < dend); +} +#endif + +static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx) { + int x = 0; + int j; + for (j = 0; j < dst_width; ++j) { + int xi = x >> 16; + int xf1 = x & 0xffff; + int xf0 = 65536 - xf1; + + *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; + x += dx; + } +} + +//Not work on Windows +//static const int kMaxInputWidth = 2560; +#define 
kMaxInputWidth 2560 +#if defined(HAS_SCALEFILTERROWS_SSE2) +#define HAS_SCALEROWDOWN34_SSE2 +// Filter rows 0 and 1 together, 3 : 1 +static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} + +// Filter rows 1 and 2 together, 1 : 1 +static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + ALIGN16(uint8 row[kMaxInputWidth]); + assert((dst_width % 3 == 0) && (dst_width > 0)); + ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); + ScaleFilterCols34_C(dst_ptr, row, dst_width); +} +#endif + +static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + + src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + + src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + + src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width) { + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i+=3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + + src_ptr[src_stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + + src_ptr[src_stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// C version 8x2 -> 8x1 +static void ScaleFilterRows_C(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction; + int y0_fraction; + const uint8* src_ptr1; + uint8* end; + assert(dst_width > 0); + y1_fraction = source_y_fraction; + y0_fraction = 256 - y1_fraction; + src_ptr1 = src_ptr + src_stride; + end = dst_ptr + dst_width; + do { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; + dst_ptr[5] 
= (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; + dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; + dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; + src_ptr += 8; + src_ptr1 += 8; + dst_ptr += 8; + } while (dst_ptr < end); + dst_ptr[0] = dst_ptr[-1]; +} + +void ScaleAddRows_C(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int x,y; + assert(src_width > 0); + assert(src_height > 0); + for (x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + int sum = 0; + for (y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + dst_ptr[x] = sum; + } +} + +/** + * Scale plane, 1/2 + * + * This is an optimized version for scaling down a plane to 1/2 of + * its original size. + * + */ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + } else +#endif +/* TODO: Force to call C version all the time in ordert to get matching results + * in multi-resolution encoder example. + */ +#if 0 //defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 16) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + } else +#endif + { + ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 1); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/4 + * + * This is an optimized version for scaling down a plane to 1/4 of + * its original size. + */ +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 4)); + assert(IS_ALIGNED(src_height, 4)); + +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(dst_width, 4)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + } else +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + } else +#endif + { + ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 2); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane, 1/8 + * + * This is an optimized version for scaling down a plane to 1/8 + * of its original size. 
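+ * With filtering, each output pixel is effectively the rounded average of an
+ * 8x8 block of source pixels (the C path builds it from two 4x4 box averages
+ * followed by a 2x2 average; see ScaleRowDown8Int_C); without filtering it
+ * simply takes every 8th pixel.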
+ * + */ +static void ScalePlaneDown8(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(IS_ALIGNED(src_width, 8)); + assert(IS_ALIGNED(src_height, 8)); + +#if defined(HAS_SCALEROWDOWN8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; + } else +#endif + { + ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? + ScaleRowDown8Int_C : ScaleRowDown8_C; + } + + { + int y; + for (y = 0; y < dst_height; ++y) { + ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += (src_stride << 3); + dst_ptr += dst_stride; + } + } +} + +/** + * Scale plane down, 3/4 + * + * Provided by Frank Barchard (fbarchard@google.com) + * + */ +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + } + } else +#endif +#if defined(HAS_SCALEROWDOWN34_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && + filtering) { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + } else +#endif + { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 1: + ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + break; + + case 2: + ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + dst_ptr, dst_width); + break; + } + ++src_row; + src_ptr += src_stride; + dst_ptr += dst_stride; + if (src_row >= 3) { + src_ptr += src_stride; + src_row = 0; + } + } + } +} + +/** + * Scale plane, 3/8 + * + * This is an optimized version for scaling down a plane to 3/8 + * of its original size. 
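+ * Vertically, every 8 source rows map to 3 output rows: the row loop below
+ * steps through 3, 3 and then 2 source rows per output row.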
+ * + * Reduces 16x3 to 6x1 + */ +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, + uint8* dst_ptr, int dst_width); + assert(dst_width % 3 == 0); +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON) && + (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + } + } else +#endif + +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_stride, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + } + } else +#endif + { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + } + } + { + int src_row = 0; + int y; + for (y = 0; y < dst_height; ++y) { + switch (src_row) { + case 0: + case 1: + ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + ++src_row; + break; + + case 2: + ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + src_row = 0; + break; + } + dst_ptr += dst_stride; + } + } +} + +__inline static uint32 SumBox(int iboxwidth, int iboxheight, + int src_stride, const uint8* src_ptr) { + int x, y; + uint32 sum; + assert(iboxwidth > 0); + assert(iboxheight > 0); + sum = 0u; + for (y = 0; y < iboxheight; ++y) { + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + src_ptr += src_stride; + } + return sum; +} + +static void ScalePlaneBoxRow(int dst_width, int boxheight, + int dx, int src_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / + (boxwidth * boxheight); + } +} + +__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum; + int x; + assert(iboxwidth > 0); + sum = 0u; + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int scaletbl[2]; + int minboxwidth = (dx >> 16); + scaletbl[0] = 65536 / (minboxwidth * boxheight); + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + { + int *scaleptr = scaletbl - minboxwidth; + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + int boxwidth; + x += dx; + boxwidth = (x >> 16) - ix; + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = (dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + 
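+    // scaleval is 65536 / (boxwidth * boxheight), so multiplying the box sum
+    // by it and shifting right by 16 approximates the average without a divide.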
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +/** + * Scale plane down to any dimensions, with interpolation. + * (boxfilter). + * + * Same method as SimpleScale, which is fixed point, outputting + * one pixel of destination using fixed point (16.16) to step + * through source, sampling a box of pixel with simple + * averaging. + */ +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dx, dy; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || + dst_height * 2 > src_height) { + uint8* dst = dst_ptr; + int dy = (src_height << 16) / dst_height; + int dx = (src_width << 16) / dst_width; + int y = 0; + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + int boxheight; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScalePlaneBoxRow(dst_width, boxheight, + dx, src_stride, + src, dst); + + dst += dst_stride; + } + } else { + ALIGN16(uint16 row[kMaxInputWidth]); + void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, + uint16* dst_ptr, int src_width, int src_height); + void (*ScaleAddCols)(int dst_width, int boxheight, int dx, + const uint16* src_ptr, uint8* dst_ptr); +#if defined(HAS_SCALEADDROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleAddRows = ScaleAddRows_SSE2; + } else +#endif + { + ScaleAddRows = ScaleAddRows_C; + } + if (dx & 0xffff) { + ScaleAddCols = ScaleAddCols2_C; + } else { + ScaleAddCols = ScaleAddCols1_C; + } + + { + int y = 0; + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + const uint8* const src = src_ptr + iy * src_stride; + int boxheight; + y += dy; + if (y > (src_height << 16)) { + y = (src_height << 16); + } + boxheight = (y >> 16) - iy; + ScaleAddRows(src, src_stride, row, src_width, boxheight); + ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); + dst_ptr += dst_stride; + } + } + } +} + +/** + * Scale plane to/from any dimensions, with interpolation. + */ +static void ScalePlaneBilinearSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i, j; + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int dy = (src_height << 16) / dst_height; + int maxx = ((src_width - 1) << 16) - 1; + int maxy = ((src_height - 1) << 16) - 1; + int y = (dst_height < src_height) ? 32768 : + (src_height << 16) / dst_height - 32768; + for (i = 0; i < dst_height; ++i) { + int cy = (y < 0) ? 0 : y; + int yi = cy >> 16; + int yf = cy & 0xffff; + const uint8* const src = src_ptr + yi * src_stride; + int x = (dst_width < src_width) ? 32768 : + (src_width << 16) / dst_width - 32768; + for (j = 0; j < dst_width; ++j) { + int cx = (x < 0) ? 
0 : x; + int xi = cx >> 16; + int xf = cx & 0xffff; + int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; + int r1 = (src[xi + src_stride] * (65536 - xf) + + src[xi + src_stride + 1] * xf) >> 16; + *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; + x += dx; + if (x > maxx) + x = maxx; + } + dst += dst_stride - dst_width; + y += dy; + if (y > maxy) + y = maxy; + } +} + +/** + * Scale plane to/from any dimensions, with bilinear + * interpolation. + */ +static void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int dy; + int dx; + assert(dst_width > 0); + assert(dst_height > 0); + dy = (src_height << 16) / dst_height; + dx = (src_width << 16) / dst_width; + if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { + ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + + } else { + ALIGN16(uint8 row[kMaxInputWidth + 1]); + void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, + int src_stride, + int dst_width, int source_y_fraction); + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSSE3; + } else +#endif +#if defined(HAS_SCALEFILTERROWS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_SSE2; + } else +#endif + { + ScaleFilterRows = ScaleFilterRows_C; + } + ScaleFilterCols = ScaleFilterCols_C; + + { + int y = 0; + int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. + int j; + for (j = 0; j < dst_height; ++j) { + int iy = y >> 16; + int fy = (y >> 8) & 255; + const uint8* const src = src_ptr + iy * src_stride; + ScaleFilterRows(row, src, src_stride, src_width, fy); + ScaleFilterCols(dst_ptr, row, dst_width, dx); + dst_ptr += dst_stride; + y += dy; + if (y > maxy) { + y = maxy; + } + } + } + } +} + +/** + * Scale plane to/from any dimensions, without interpolation. + * Fixed point math is used for performance: The upper 16 bits + * of x and dx is the integer part of the source position and + * the lower 16 bits are the fixed decimal part. + */ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + uint8* dst = dst_ptr; + int dx = (src_width << 16) / dst_width; + int y; + for (y = 0; y < dst_height; ++y) { + const uint8* const src = src_ptr + (y * src_height / dst_height) * + src_stride; + // TODO(fbarchard): Round X coordinate by setting x=0x8000. + int x = 0; + int i; + for (i = 0; i < dst_width; ++i) { + *dst++ = src[x >> 16]; + x += dx; + } + dst += dst_stride - dst_width; + } +} + +/** + * Scale plane to/from any dimensions. 
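+ *
+ * Both the filtered and unfiltered paths step through the source with 16.16
+ * fixed point coordinates; e.g. mapping 1280 source columns onto 320 output
+ * columns uses dx = (1280 << 16) / 320 = 0x00040000, i.e. 4.0 source pixels
+ * per output pixel.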
+ */ +static void ScalePlaneAnySize(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + // fall back to non-optimized version + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Scale plane down, any size + * + * This is an optimized version for scaling down a plane to any size. + * The current implementation is ~10 times faster compared to the + * reference implementation for e.g. XGA->LowResPAL + * + */ +static void ScalePlaneDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + if (!filtering) { + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { + // between 1/2x and 1x use bilinear + ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } else { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src_ptr, dst_ptr); + } +} + +/** + * Copy plane, no scaling + * + * This simply copies the given plane without scaling. + * The current implementation is ~115 times faster + * compared to the reference implementation. + * + */ +static void CopyPlane(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + if (src_stride == src_width && dst_stride == dst_width) { + // All contiguous, so can use REALLY fast path. + memcpy(dst_ptr, src_ptr, src_width * src_height); + } else { + // Not all contiguous; must copy scanlines individually + const uint8* src = src_ptr; + uint8* dst = dst_ptr; + int i; + for (i = 0; i < src_height; ++i) { + memcpy(dst, src, src_width); + dst += dst_stride; + src += src_stride; + } + } +} + +static void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + FilterMode filtering, int use_ref) { + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); + } else if (dst_width <= src_width && dst_height <= src_height) { + // Scale down. + if (use_ref) { + // For testing, allow the optimized versions to be disabled. + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { + // optimized, 3/4 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + // optimized, 1/2 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + // 3/8 rounded up for odd sized chroma height. 
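For reference, the plane scalers above all walk the source with 16.16 fixed-point coordinates: the step is dx = (src_width << 16) / dst_width and the integer part of the accumulator selects the source sample. A minimal standalone illustration of just that stepping (not part of the patch; the function name is illustrative):

#include <stdint.h>

/* Nearest-neighbor scaling of one row using 16.16 fixed-point stepping.
 * The integer part of 'x' is the source column; 'dx' is the step per
 * destination pixel. Mirrors the arithmetic used by ScalePlaneSimple. */
static void scale_row_nearest_16_16(const uint8_t *src, int src_width,
                                    uint8_t *dst, int dst_width) {
  int dx = (src_width << 16) / dst_width;
  int x = 0;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];
    x += dx;
  }
}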
+ } else if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { + // optimized, 1/4 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { + // optimized, 1/8 + ScalePlaneDown8(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } else { + // Arbitrary downsample + ScalePlaneDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } + } else { + // Arbitrary scale up and/or down. + ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + } +} + +/** + * Scale a plane. + * + * This function in turn calls a scaling function + * suitable for handling the desired resolutions. + * + */ + +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. + if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + { + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + } + return 0; +} + +int Scale_2(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + int interpolate) { + int halfsrc_width; + int halfsrc_height; + int halfdst_width; + int halfoheight; + FilterMode filtering; + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + // Negative height means invert the image. 
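I420Scale above scales the three planes independently; the chroma planes are half the luma dimensions, rounded up. A small sketch of the plane-size bookkeeping a caller can rely on (illustrative only, assuming the I420 layout used throughout this file):

#include <stddef.h>

/* Given luma dimensions, compute the I420 chroma plane dimensions and the
 * total frame size in bytes. Chroma is half size, rounded up, matching the
 * (x + 1) >> 1 convention used by I420Scale. */
static void i420_plane_sizes(int width, int height,
                             int *uv_width, int *uv_height,
                             size_t *frame_bytes) {
  *uv_width = (width + 1) >> 1;
  *uv_height = (height + 1) >> 1;
  *frame_bytes = (size_t)width * height +
                 2 * (size_t)(*uv_width) * (*uv_height);
}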
+ if (src_height < 0) { + int halfheight; + src_height = -src_height; + halfheight = (src_height + 1) >> 1; + src_y = src_y + (src_height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + halfsrc_width = (src_width + 1) >> 1; + halfsrc_height = (src_height + 1) >> 1; + halfdst_width = (dst_width + 1) >> 1; + halfoheight = (dst_height + 1) >> 1; + filtering = interpolate ? kFilterBox : kFilterNone; + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering, use_reference_impl_); + ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, + dst_u, dst_stride_u, halfdst_width, halfoheight, + filtering, use_reference_impl_); + ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, + dst_v, dst_stride_v, halfdst_width, halfoheight, + filtering, use_reference_impl_); + return 0; +} + +int Scale_1(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int ooffset, + int interpolate) { + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || + ooffset >= dst_height) { + return -1; + } + ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. + { + int halfsrc_width = (src_width + 1) >> 1; + int halfsrc_height = (src_height + 1) >> 1; + int halfdst_width = (dst_width + 1) >> 1; + int halfoheight = (dst_height + 1) >> 1; + int aheight = dst_height - ooffset * 2; // actual output height + const uint8* const iyptr = src; + uint8* oyptr = dst + ooffset * dst_width; + const uint8* const iuptr = src + src_width * src_height; + uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; + const uint8* const ivptr = src + src_width * src_height + + halfsrc_width * halfsrc_height; + uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + + (ooffset >> 1) * halfdst_width; + return Scale_2(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, + src_width, src_height, oyptr, ouptr, ovptr, dst_width, + halfdst_width, halfdst_width, dst_width, aheight, interpolate); + } +} + +//} // namespace libyuv diff --git a/usage.dox b/usage.dox index 0db080b00..9370e428f 100644 --- a/usage.dox +++ b/usage.dox @@ -82,6 +82,7 @@ The available initialization methods are: \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif + \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif diff --git a/usage_cx.dox b/usage_cx.dox index 980a03461..62f3e450b 100644 --- a/usage_cx.dox +++ b/usage_cx.dox @@ -1,6 +1,6 @@ /*! \page usage_encode Encode - The vpx_codec_encode() function is at the core of the decode loop. It + The vpx_codec_encode() function is at the core of the encode loop. It processes raw images passed by the application, producing packets of compressed data. The <code>deadline</code> parameter controls the amount of time in microseconds the encoder should spend working on the frame. 
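A typical real-time call passes VPX_DL_REALTIME for the deadline, as the multi-resolution example later in this patch does. A minimal sketch, assuming codec, img, pts and flags are already set up by the caller:

vpx_codec_err_t res;
/* Encode one frame with the real-time deadline; passing NULL for img
 * flushes the encoder. pts and duration are in timebase units. */
res = vpx_codec_encode(&codec, img, pts, 1 /* duration */, flags,
                       VPX_DL_REALTIME);
if (res != VPX_CODEC_OK)
  fprintf(stderr, "encode failed: %s\n", vpx_codec_error(&codec));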
For @@ -10,5 +10,4 @@ \ref samples - */ diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index a90c1c0b6..91e90e2a6 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -170,6 +170,18 @@ typedef struct union b_mode_info bmi[16]; } MODE_INFO; +#if CONFIG_MULTI_RES_ENCODING +/* The information needed to be stored for higher-resolution encoder */ +typedef struct +{ + MB_PREDICTION_MODE mode; + MV_REFERENCE_FRAME ref_frame; + int_mv mv; + //union b_mode_info bmi[16]; + int dissim; // dissimilarity level of the macroblock +} LOWER_RES_INFO; +#endif + typedef struct { short *qcoeff; diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 3f04dab4c..37fa5a0cd 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -17,6 +17,7 @@ extern "C" { #endif +#include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" @@ -207,6 +208,19 @@ extern "C" unsigned int periodicity; unsigned int layer_id[MAX_PERIODICITY]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of total resolutions encoded */ + unsigned int mr_total_resolutions; + + /* Current encoder ID */ + unsigned int mr_encoder_id; + + /* Down-sampling factor */ + vpx_rational_t mr_down_sampling_factor; + + /* Memory location to store low-resolution encoder's mode info */ + void* mr_low_res_mode_info; +#endif } VP8_CONFIG; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 8ec9e27c9..0927f51cf 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -49,8 +49,8 @@ extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, int count); void vp8_build_block_offsets(MACROBLOCK *x); void vp8_setup_block_ptrs(MACROBLOCK *x); -int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset); -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset, int mb_row, int mb_col); +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col); static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ); #ifdef MODE_STATS @@ -475,14 +475,14 @@ void encode_mb_row(VP8_COMP *cpi, if (cm->frame_type == KEY_FRAME) { - *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp); + *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp, mb_row, mb_col); #ifdef MODE_STATS y_modes[xd->mbmi.mode] ++; #endif } else { - *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset); + *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_row, mb_col); #ifdef MODE_STATS inter_y_modes[xd->mbmi.mode] ++; @@ -1142,7 +1142,7 @@ static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x ) #endif } -int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) +int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int mb_row, int mb_col) { int rate; @@ -1182,7 +1182,8 @@ extern void vp8_fix_contexts(MACROBLOCKD *x); int vp8cx_encode_inter_macroblock ( VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, - int recon_yoffset, int recon_uvoffset + int recon_yoffset, int recon_uvoffset, + int mb_row, int mb_col ) { MACROBLOCKD *const xd = &x->e_mbd; @@ -1230,8 +1231,25 @@ int vp8cx_encode_inter_macroblock } else + { +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_encoder_id == 0) + { + /* Lowest-resolution encoding */ + vp8_pick_inter_mode(cpi, x, 
recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error); + + }else + { + /* Higher-resolution encoding */ + vp8_mr_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, + &distortion, &intra_error, mb_row, mb_col); + } +#else vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); +#endif + } cpi->prediction_error += distortion; cpi->intra_error += intra_error; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index c1a0ea7bf..9d963832a 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -9,6 +9,7 @@ */ +#include "onyx_int.h" #include "mcomp.h" #include "vpx_mem/vpx_mem.h" #include "vpx_config.h" @@ -182,8 +183,6 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best -#define MIN(x,y) (((x)<(y))?(x):(y)) -#define MAX(x,y) (((x)>(y))?(x):(y)) int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -331,8 +330,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef IFMVCV #undef ERR #undef CHECK_BETTER -#undef MIN -#undef MAX + int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -854,6 +852,8 @@ int vp8_hex_search int k = -1; int all_in; int best_site = -1; + int hex_range = 127; + int dia_range = 8; int_mv fcenter_mv; fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; @@ -873,6 +873,18 @@ int vp8_hex_search in_what_stride, 0x7fffffff) + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); +#if CONFIG_MULTI_RES_ENCODING + /* Lower search range based on prediction info */ + if (search_param >= 6) goto cal_neighbors; + else if (search_param >= 5) hex_range = 4; + else if (search_param >= 4) hex_range = 6; + else if (search_param >= 3) hex_range = 15; + else if (search_param >= 2) hex_range = 31; + else if (search_param >= 1) hex_range = 63; + + dia_range = 8; +#endif + // hex search //j=0 CHECK_BOUNDS(2) @@ -909,7 +921,7 @@ int vp8_hex_search k = best_site; } - for (j = 1; j < 127; j++) + for (j = 1; j < hex_range; j++) { best_site = -1; CHECK_BOUNDS(2) @@ -951,7 +963,7 @@ int vp8_hex_search // check 4 1-away neighbors cal_neighbors: - for (j = 0; j < 32; j++) + for (j = 0; j < dia_range; j++) { best_site = -1; CHECK_BOUNDS(1) diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c new file mode 100644 index 000000000..7a62a06ec --- /dev/null +++ b/vp8/encoder/mr_dissim.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <limits.h> +#include "vpx_config.h" +#include "onyx_int.h" +#include "mr_dissim.h" +#include "vpx_mem/vpx_mem.h" +#include "rdopt.h" + +void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) +{ + int low_res_w; + + /* Support arbitrary down-sampling factor */ + unsigned int iw = cpi->oxcf.Width*cpi->oxcf.mr_down_sampling_factor.den + + cpi->oxcf.mr_down_sampling_factor.num - 1; + + low_res_w = iw/cpi->oxcf.mr_down_sampling_factor.num; + cpi->mr_low_res_mb_cols = ((low_res_w + 15) >> 4); +} + +#define GET_MV(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + cnt++; \ +} + +#define GET_MV_SIGN(x) \ +if(x->mbmi.ref_frame !=INTRA_FRAME) \ +{ \ + mvx[cnt] = x->mbmi.mv.as_mv.row; \ + mvy[cnt] = x->mbmi.mv.as_mv.col; \ + if (cm->ref_frame_sign_bias[x->mbmi.ref_frame] \ + != cm->ref_frame_sign_bias[tmp->mbmi.ref_frame]) \ + { \ + mvx[cnt] *= -1; \ + mvy[cnt] *= -1; \ + } \ + cnt++; \ +} + +void vp8_cal_dissimilarity(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + /* Note: The first row & first column in mip are outside the frame, which + * were initialized to all 0.(ref_frame, mode, mv...) + * Their ref_frame = 0 means they won't be counted in the following + * calculation. + */ + if (cpi->oxcf.mr_total_resolutions >1 + && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1)) + { + /* Store info for show/no-show frames for supporting alt_ref. + * If parent frame is alt_ref, child has one too. + */ + if(cm->frame_type != KEY_FRAME) + { + int mb_row; + int mb_col; + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip + cm->mode_info_stride; + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++) + { + tmp++; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++) + { + int dissim = INT_MAX; + + if(tmp->mbmi.ref_frame !=INTRA_FRAME) + { + int mvx[8]; + int mvy[8]; + int mmvx; + int mmvy; + int cnt=0; + const MODE_INFO *here = tmp; + const MODE_INFO *above = here - cm->mode_info_stride; + const MODE_INFO *left = here - 1; + const MODE_INFO *aboveleft = above - 1; + const MODE_INFO *aboveright = NULL; + const MODE_INFO *right = NULL; + const MODE_INFO *belowleft = NULL; + const MODE_INFO *below = NULL; + const MODE_INFO *belowright = NULL; + + /* If alternate reference frame is used, we have to + * check sign of MV. 
*/ + if(cpi->oxcf.play_alternate) + { + /* Gather mv of neighboring MBs */ + GET_MV_SIGN(above) + GET_MV_SIGN(left) + GET_MV_SIGN(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV_SIGN(right) + GET_MV_SIGN(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV_SIGN(below) + GET_MV_SIGN(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV_SIGN(belowright) + } + }else + { + /* No alt_ref and gather mv of neighboring MBs */ + GET_MV(above) + GET_MV(left) + GET_MV(aboveleft) + + if(mb_col < (cm->mb_cols-1)) + { + right = here + 1; + aboveright = above + 1; + GET_MV(right) + GET_MV(aboveright) + } + + if(mb_row < (cm->mb_rows-1)) + { + below = here + cm->mode_info_stride; + belowleft = below - 1; + GET_MV(below) + GET_MV(belowleft) + } + + if(mb_col < (cm->mb_cols-1) + && mb_row < (cm->mb_rows-1)) + { + belowright = below + 1; + GET_MV(belowright) + } + } + + if (cnt > 0) + { + int max_mvx = mvx[0]; + int min_mvx = mvx[0]; + int max_mvy = mvy[0]; + int min_mvy = mvy[0]; + int i; + + if (cnt > 1) + { + for (i=1; i< cnt; i++) + { + if (mvx[i] > max_mvx) max_mvx = mvx[i]; + else if (mvx[i] < min_mvx) min_mvx = mvx[i]; + if (mvy[i] > max_mvy) max_mvy = mvy[i]; + else if (mvy[i] < min_mvy) min_mvy = mvy[i]; + } + } + + mmvx = MAX(abs(min_mvx - here->mbmi.mv.as_mv.row), + abs(max_mvx - here->mbmi.mv.as_mv.row)); + mmvy = MAX(abs(min_mvy - here->mbmi.mv.as_mv.col), + abs(max_mvy - here->mbmi.mv.as_mv.col)); + dissim = MAX(mmvx, mmvy); + } + } + + /* Store mode info for next resolution encoding */ + store_mode_info->mode = tmp->mbmi.mode; + store_mode_info->ref_frame = tmp->mbmi.ref_frame; + store_mode_info->mv.as_int = tmp->mbmi.mv.as_int; + store_mode_info->dissim = dissim; + tmp++; + store_mode_info++; + } + } + } + } +} diff --git a/vp8/encoder/mr_dissim.h b/vp8/encoder/mr_dissim.h new file mode 100644 index 000000000..3d2c2035f --- /dev/null +++ b/vp8/encoder/mr_dissim.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_MR_DISSIM_H +#define __INC_MR_DISSIM_H +#include "vpx_config.h" + +extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); +extern void vp8_cal_dissimilarity(VP8_COMP *cpi); + +#endif diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 146384b13..d40f009fb 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -36,6 +36,9 @@ #if ARCH_ARM #include "vpx_ports/arm.h" #endif +#if CONFIG_MULTI_RES_ENCODING +#include "mr_dissim.h" +#endif #include <math.h> #include <stdio.h> @@ -2234,6 +2237,13 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) vp8_loop_filter_init(cm); cpi->common.error.setjmp = 0; + +#if CONFIG_MULTI_RES_ENCODING + /* Calculate # of MBs in a row in lower-resolution level image. */ + if (cpi->oxcf.mr_encoder_id > 0) + vp8_cal_low_res_mb_cols(cpi); +#endif + return (VP8_PTR) cpi; } @@ -4338,13 +4348,20 @@ static void encode_frame_to_data_rate IF_RTCD(&cpi->rtcd.variance)); } - // This frame's MVs are saved and will be used in next frame's MV prediction. 
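In short, the dissimilarity stored for each lower-resolution macroblock is the largest per-component deviation between that block's motion vector and the motion vectors of its up to eight inter-coded neighbors. A condensed standalone restatement of that metric (illustrative; the patch gathers the neighbor vectors with the GET_MV/GET_MV_SIGN macros above and requires cnt >= 1):

#include <stdlib.h>  /* abs */

#define MR_MAX(a, b) (((a) > (b)) ? (a) : (b))

/* mvx/mvy hold the row/col components of the inter-coded neighbors' MVs,
 * cnt of them; (here_row, here_col) is the current block's MV. Returns the
 * largest absolute per-component spread, as vp8_cal_dissimilarity does. */
static int mr_dissimilarity(const int *mvx, const int *mvy, int cnt,
                            int here_row, int here_col) {
  int min_x = mvx[0], max_x = mvx[0];
  int min_y = mvy[0], max_y = mvy[0];
  int i, mmvx, mmvy;
  for (i = 1; i < cnt; ++i) {
    if (mvx[i] > max_x) max_x = mvx[i];
    else if (mvx[i] < min_x) min_x = mvx[i];
    if (mvy[i] > max_y) max_y = mvy[i];
    else if (mvy[i] < min_y) min_y = mvy[i];
  }
  mmvx = MR_MAX(abs(min_x - here_row), abs(max_x - here_row));
  mmvy = MR_MAX(abs(min_y - here_col), abs(max_y - here_col));
  return MR_MAX(mmvx, mmvy);
}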
- // Last frame has one more line(add to bottom) and one more column(add to right) than cm->mip. The edge elements are initialized to 0. - if(cm->show_frame) //do not save for altref frame + /* This frame's MVs are saved and will be used in next frame's MV predictor. + * Last frame has one more line(add to bottom) and one more column(add to + * right) than cm->mip. The edge elements are initialized to 0. + */ +#if CONFIG_MULTI_RES_ENCODING + if(!cpi->oxcf.mr_encoder_id && cm->show_frame) +#else + if(cm->show_frame) /* do not save for altref frame */ +#endif { int mb_row; int mb_col; - MODE_INFO *tmp = cm->mip; //point to beginning of allocated MODE_INFO arrays. + /* Point to beginning of allocated MODE_INFO arrays. */ + MODE_INFO *tmp = cm->mip; if(cm->frame_type != KEY_FRAME) { @@ -4363,6 +4380,10 @@ static void encode_frame_to_data_rate } } +#if CONFIG_MULTI_RES_ENCODING + vp8_cal_dissimilarity(cpi); +#endif + // Update the GF useage maps. // This is done after completing the compression of a frame when all // modes etc. are finalized but before loop filter diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 202f61471..ca36c85af 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -58,6 +58,9 @@ #define MAX_PERIODICITY 16 +#define MAX(x,y) (((x)>(y))?(x):(y)) +#define MIN(x,y) (((x)<(y))?(x):(y)) + typedef struct { int kf_indicated; @@ -679,6 +682,11 @@ typedef struct VP8_COMP double total_ssimg_v_in_layer[MAX_LAYERS]; double total_ssimg_all_in_layer[MAX_LAYERS]; +#if CONFIG_MULTI_RES_ENCODING + /* Number of MBs per row at lower-resolution level */ + int mr_low_res_mb_cols; +#endif + } VP8_COMP; void control_data_rate(VP8_COMP *cpi); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index f92618fae..d9c89750b 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -703,6 +703,14 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, if (cpi->sf.search_method == HEX) { +#if CONFIG_MULTI_RES_ENCODING + /* TODO: In higher-res pick_inter_mode, step_param is used to + * modify hex search range. Here, set step_param to 0 not to + * change the behavior in lowest-resolution encoder. + * Will improve it later. + */ + step_param = 0; +#endif bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); @@ -949,3 +957,568 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_) *rate_ = best_rate; } + +#if CONFIG_MULTI_RES_ENCODING +void vp8_mr_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col) +{ + BLOCK *b = &x->block[0]; + BLOCKD *d = &x->e_mbd.block[0]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO best_mbmode; + + int_mv best_ref_mv; + int_mv mode_mv[MB_MODE_COUNT]; + MB_PREDICTION_MODE this_mode; + int num00; + int mdcounts[4]; + int best_rd = INT_MAX; // 1 << 30; + int best_intra_rd = INT_MAX; + int mode_index; + int rate; + int rate2; + int distortion2; + int bestsme; + int best_mode_index = 0; + unsigned int sse = INT_MAX, best_sse = INT_MAX; + + int_mv mvp; + int_mv nearest_mv[4]; + int_mv near_mv[4]; + int_mv frame_best_ref_mv[4]; + int MDCounts[4][4]; + unsigned char *y_buffer[4]; + unsigned char *u_buffer[4]; + unsigned char *v_buffer[4]; + int skip_mode[4] = {0, 0, 0, 0}; + int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, + when Speed >= 15, no sub-pixel search. 
*/ + int lfdone=0, gfdone=0, afdone=0; + + LOWER_RES_INFO* store_mode_info + = (LOWER_RES_INFO*)cpi->oxcf.mr_low_res_mode_info; + unsigned int parent_mb_index; + //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col]; + int dissim; + int parent_ref_frame; + int_mv parent_ref_mv; + MB_PREDICTION_MODE parent_mode; + + /* Consider different down_sampling_factor. */ + { + /* TODO: Removed the loop that supports special down_sampling_factor + * such as 2, 4, 8. Will revisit it if needed. + * Should also try using a look-up table to see if it helps + * performance. */ + int round = cpi->oxcf.mr_down_sampling_factor.num/2; + int parent_mb_row, parent_mb_col; + + parent_mb_row = (mb_row*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_col = (mb_col*cpi->oxcf.mr_down_sampling_factor.den+round) + /cpi->oxcf.mr_down_sampling_factor.num; + parent_mb_index = parent_mb_row*cpi->mr_low_res_mb_cols + parent_mb_col; + } + + /* Read lower-resolution mode & motion result from memory.*/ + parent_ref_frame = store_mode_info[parent_mb_index].ref_frame; + parent_mode = store_mode_info[parent_mb_index].mode; + dissim = store_mode_info[parent_mb_index].dissim; + + /* For highest-resolution encoder, adjust dissim value. Lower its quality + * for good performance. */ + if (cpi->oxcf.mr_encoder_id == (cpi->oxcf.mr_total_resolutions - 1)) + dissim>>=1; + + if(parent_ref_frame != INTRA_FRAME) + { + /* Consider different down_sampling_factor. + * The result can be rounded to be more precise, but it takes more time. + */ + //int round = cpi->oxcf.mr_down_sampling_factor.den/2; + parent_ref_mv.as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + parent_ref_mv.as_mv.col = store_mode_info[parent_mb_index].mv.as_mv.col + *cpi->oxcf.mr_down_sampling_factor.num + /cpi->oxcf.mr_down_sampling_factor.den; + + vp8_clamp_mv2(&parent_ref_mv, xd); + } + + vpx_memset(mode_mv, 0, sizeof(mode_mv)); + vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); + vpx_memset(near_mv, 0, sizeof(near_mv)); + vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + + cpi->mbs_tested_so_far++; + + *returnintra = INT_MAX; + x->skip = 0; + + x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; + + // if we encode a new mv this is important + // find the best new motion vector + for (mode_index = 0; mode_index < MAX_MODES; mode_index++) + { + int frame_cost; + int this_rd = INT_MAX; + + if (best_rd <= cpi->rd_threshes[mode_index]) + continue; + + /* If parent MB is intra, child MB is intra. */ + if (!parent_ref_frame && vp8_ref_frame_order[mode_index]) + continue; + + /* If parent MB is inter, and it is unlikely there are multiple objects + * in parent MB, we use parent ref frame as child MB's ref frame. */ + if (parent_ref_frame && dissim < 8 + && parent_ref_frame != vp8_ref_frame_order[mode_index]) + continue; + + x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index]; + + if(x->e_mbd.mode_info_context->mbmi.ref_frame) + { + if(x->e_mbd.mode_info_context->mbmi.ref_frame==LAST_FRAME && !lfdone) + { + // set up all the refframe dependent pointers. 
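The parent-macroblock lookup above reduces to a rounded division by the down-sampling factor. Expressed as a standalone helper (illustrative; low_res_mb_cols is what vp8_cal_low_res_mb_cols computes):

/* Map a macroblock position in the current (higher) resolution to the index
 * of the co-located macroblock in the lower resolution, for an arbitrary
 * down-sampling factor num/den (e.g. 3/2 for a 1.5x reduction per level). */
static unsigned int parent_mb_index(int mb_row, int mb_col,
                                    int num, int den, int low_res_mb_cols) {
  int round = num / 2;
  int parent_row = (mb_row * den + round) / num;
  int parent_col = (mb_col * den + round) / num;
  return (unsigned int)(parent_row * low_res_mb_cols + parent_col);
}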
+ //if (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME + //&& (cpi->ref_frame_flags & VP8_LAST_FLAG)) + if (cpi->ref_frame_flags & VP8_LAST_FLAG) + { + YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME], + &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], + LAST_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset; + u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset; + v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[LAST_FRAME] = 1; + + lfdone = 1; + } + + if(x->e_mbd.mode_info_context->mbmi.ref_frame==GOLDEN_FRAME && !gfdone) + { + //if (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME + //&& (cpi->ref_frame_flags & VP8_GOLD_FLAG)) + if (cpi->ref_frame_flags & VP8_GOLD_FLAG) + { + YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME], + &frame_best_ref_mv[GOLDEN_FRAME],MDCounts[GOLDEN_FRAME], + GOLDEN_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset; + u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset; + v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[GOLDEN_FRAME] = 1; + + gfdone = 1; + } + + if(x->e_mbd.mode_info_context->mbmi.ref_frame==ALTREF_FRAME && !afdone) + { + //if (x->e_mbd.mode_info_context->mbmi.ref_frame == ALTREF_FRAME + //&& (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active)) + if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active) + { + YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx]; + + vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, + &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME], + &frame_best_ref_mv[ALTREF_FRAME],MDCounts[ALTREF_FRAME], + ALTREF_FRAME, cpi->common.ref_frame_sign_bias); + + y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; + u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset; + v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset; + } + else + skip_mode[ALTREF_FRAME] = 1; + + afdone = 1; + } + + if (skip_mode[x->e_mbd.mode_info_context->mbmi.ref_frame]) + continue; + + x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame]; + mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame]; + memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts)); + + if (vp8_mode_order[mode_index] == NEARESTMV && mode_mv[NEARESTMV].as_int ==0) + continue; + if (vp8_mode_order[mode_index] == NEARMV && mode_mv[NEARMV].as_int ==0) + continue; + + if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV + && best_ref_mv.as_int==0) //&& dissim==0 + continue; + else if(vp8_mode_order[mode_index] == NEWMV && dissim==0 + && best_ref_mv.as_int==parent_ref_mv.as_int) + continue; + } + + // Check to see if the testing frequency for this mode is at its max + // If so then prevent it from being tested and increase 
the threshold for its testing + if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1)) + { + //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] ) + if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])) + { + // Increase the threshold for coding this mode to make it less likely to be chosen + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + + continue; + } + } + + // We have now reached the point where we are going to test the current + //mode so increment the counter for the number of times it has been tested + cpi->mode_test_hit_counts[mode_index] ++; + + rate2 = 0; + distortion2 = 0; + + this_mode = vp8_mode_order[mode_index]; + + x->e_mbd.mode_info_context->mbmi.mode = this_mode; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + + // Work out the cost assosciated with selecting the reference frame + frame_cost = + x->e_mbd.ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame]; + rate2 += frame_cost; + + // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative + if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) + { + if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME) + continue; + } + + switch (this_mode) + { + case B_PRED: + // Pass best so far to pick_intra4x4mby_modes to use as breakout + distortion2 = best_sse; + pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); + + if (distortion2 == INT_MAX) + { + this_rd = INT_MAX; + } + else + { + rate2 += rate; + distortion2 = VARIANCE_INVOKE + (&cpi->rtcd.variance, var16x16)( + *(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + } + + break; + + case DC_PRED: + case V_PRED: + case H_PRED: + case TM_PRED: + RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby) + (&x->e_mbd); + distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16) + (*(b->base_src), b->src_stride, + x->e_mbd.predictor, 16, &sse); + rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (this_rd < best_intra_rd) + { + best_intra_rd = this_rd; + *returnintra = distortion2; + } + break; + + case NEWMV: + { + int thissme; + int step_param; + int further_steps; + int n = 0; + int sadpb = x->sadperbit16; + int_mv mvp_full; + + int col_min = (best_ref_mv.as_mv.col>>3) - MAX_FULL_PEL_VAL + + ((best_ref_mv.as_mv.col & 7)?1:0); + int row_min = (best_ref_mv.as_mv.row>>3) - MAX_FULL_PEL_VAL + + ((best_ref_mv.as_mv.row & 7)?1:0); + int col_max = (best_ref_mv.as_mv.col>>3) + MAX_FULL_PEL_VAL; + int row_max = (best_ref_mv.as_mv.row>>3) + MAX_FULL_PEL_VAL; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + + int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 
3 : 2) : 1; + int diff_mv = MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row), + abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)); + + // Further step/diamond searches as necessary + step_param = cpi->sf.first_step + speed_adjust; //sf->first_step = 1; for -6 step_param =3; + + // Use parent MV as predictor. Adjust search range accordingly. + mvp.as_int = parent_ref_mv.as_int; + mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3; + mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3; + + if(dissim <=32) step_param += 3; + else if(dissim <=128) step_param += 2; + else step_param += 1; + + if(dissim >2 || diff_mv >4) + { + /* Get intersection of UMV window and valid MV window to + * reduce # of checks in diamond search. */ + if (x->mv_col_min < col_min ) + x->mv_col_min = col_min; + if (x->mv_col_max > col_max ) + x->mv_col_max = col_max; + if (x->mv_row_min < row_min ) + x->mv_row_min = row_min; + if (x->mv_row_max > row_max ) + x->mv_row_max = row_max; + + further_steps = (cpi->Speed >= 8)? + 0: (cpi->sf.max_step_search_steps - 1 - step_param); + + if (cpi->sf.search_method == HEX) + { + bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv, step_param, + sadpb, &cpi->fn_ptr[BLOCK_16X16], + x->mvsadcost, x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv, + step_param, sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + // Further step/diamond searches as necessary + n = 0; + //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + n = num00; + num00 = 0; + + while (n < further_steps) + { + n++; + + if (num00) + num00--; + else + { + thissme = + cpi->diamond_search_sad(x, b, d, &mvp_full, + &d->bmi.mv, + step_param + n, + sadpb, &num00, + &cpi->fn_ptr[BLOCK_16X16], + x->mvcost, &best_ref_mv); + if (thissme < bestsme) + { + bestsme = thissme; + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + } + else + { + d->bmi.mv.as_int = mode_mv[NEWMV].as_int; + } + } + } + } + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + }else + { + d->bmi.mv.as_int = mvp_full.as_int; + mode_mv[NEWMV].as_int = mvp_full.as_int; + } + + // This is not needed. + //if (bestsme < INT_MAX) + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv, &best_ref_mv, + x->errorperbit, + &cpi->fn_ptr[BLOCK_16X16], + cpi->mb.mvcost, + &distortion2,&sse); + + mode_mv[NEWMV].as_int = d->bmi.mv.as_int; + + // mv cost; + rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); + } + + case NEARESTMV: + case NEARMV: + // Trap vectors that reach beyond the UMV borders + // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops + // through to this point because of the lack of break statements + // in the previous two cases. 
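The motion search above starts from the scaled parent MV and spends less effort when the lower resolution reports little local motion disparity: a larger step_param means a smaller search range (vp8_hex_search lowers hex_range as search_param grows, per the table added earlier in this patch). The adjustment boils down to the following restatement (illustrative):

/* Derive the diamond/hex search step from the base speed-dependent step and
 * the parent macroblock's dissimilarity; smaller dissim gives a larger step
 * and therefore a smaller search range. */
static int mr_step_param(int base_step, int dissim) {
  if (dissim <= 32)  return base_step + 3;
  if (dissim <= 128) return base_step + 2;
  return base_step + 1;
}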
+ if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || + ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) || + ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || + ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) + continue; + + case ZEROMV: + rate2 += vp8_cost_mv_ref(this_mode, mdcounts); + x->e_mbd.mode_info_context->mbmi.mv.as_int = + mode_mv[this_mode].as_int; + + if((this_mode != NEWMV) || + !(have_subp_search) || cpi->common.full_pixel==1) + distortion2 = get_inter_mbpred_error(x, + &cpi->fn_ptr[BLOCK_16X16], + &sse, mode_mv[this_mode]); + + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + { + x->skip = 1; + } + else if (sse < x->encode_breakout) + { + // Check u and v to make sure skip is ok + int sse2 = 0; + + sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance)); + + if (sse2 * 2 < x->encode_breakout) + x->skip = 1; + else + x->skip = 0; + } + + break; + default: + break; + } + + if (this_rd < best_rd || x->skip) + { + // Note index of best mode + best_mode_index = mode_index; + + *returnrate = rate2; + *returndistortion = distortion2; + best_sse = sse; + best_rd = this_rd; + vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO)); + + // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time + cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT; + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around. + else + { + cpi->rd_thresh_mult[mode_index] += 4; + + if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT) + cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT; + + cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index]; + } + + if (x->skip) + break; + } + + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) + { + int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3); + + cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT; + cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index]; + } + + + { + int this_rdbin = (*returndistortion >> 7); + + if (this_rdbin >= 1024) + { + this_rdbin = 1023; + } + + cpi->error_bins[this_rdbin] ++; + } + + if (cpi->is_src_frame_alt_ref && + (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) + { + x->e_mbd.mode_info_context->mbmi.mode = ZEROMV; + x->e_mbd.mode_info_context->mbmi.ref_frame = ALTREF_FRAME; + x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; + x->e_mbd.mode_info_context->mbmi.mb_skip_coeff = + (cpi->common.mb_no_coeff_skip) ? 
1 : 0; + x->e_mbd.mode_info_context->mbmi.partitioning = 0; + + return; + } + + /* set to the best mb mode */ + vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); + + if (best_mbmode.mode <= B_PRED) + { + /* set mode_info_context->mbmi.uv_mode */ + pick_intra_mbuv_mode(x); + } + + update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]); +} +#endif diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h index 1c5d6a6e6..df6042fd5 100644 --- a/vp8/encoder/pickinter.h +++ b/vp8/encoder/pickinter.h @@ -14,6 +14,16 @@ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" -extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); +extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra); extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); + +#if CONFIG_MULTI_RES_ENCODING +extern void vp8_mr_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, + int recon_yoffset, int recon_uvoffset, + int *returnrate, int *returndistortion, + int *returnintra, int mb_row, int mb_col); +#endif + #endif diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 379ffe0a8..62a2cfc53 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1463,57 +1463,6 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, return bsi.segment_rd; } -static void insertsortmv(int arr[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp; - - temp = arr[i]; - - for ( k = i; k >j; k--) - arr[k] = arr[k - 1] ; - - arr[j] = temp ; - } - } - } -} - -static void insertsortsad(int arr[],int idx[], int len) -{ - int i, j, k; - - for ( i = 1 ; i <= len-1 ; i++ ) - { - for ( j = 0 ; j < i ; j++ ) - { - if ( arr[j] > arr[i] ) - { - int temp, tempi; - - temp = arr[i]; - tempi = idx[i]; - - for ( k = i; k >j; k--) - { - arr[k] = arr[k - 1] ; - idx[k] = idx[k - 1]; - } - - arr[j] = temp ; - idx[j] = tempi; - } - } - } -} - //The improved MV prediction void vp8_mv_pred ( diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 95134cb81..5ee869903 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -14,6 +14,57 @@ #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) +static void insertsortmv(int arr[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp; + + temp = arr[i]; + + for ( k = i; k >j; k--) + arr[k] = arr[k - 1] ; + + arr[j] = temp ; + } + } + } +} + +static void insertsortsad(int arr[],int idx[], int len) +{ + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for ( k = i; k >j; k--) + { + arr[k] = arr[k - 1] ; + idx[k] = idx[k - 1]; + } + + arr[j] = temp ; + idx[j] = tempi; + } + } + } +} + extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate); diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 4f21e1456..5bb6b4099 100644 --- 
a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -264,7 +264,8 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, vpx_codec_enc_cfg_t cfg, - struct vp8_extracfg vp8_cfg) + struct vp8_extracfg vp8_cfg, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { oxcf->multi_threaded = cfg.g_threads; oxcf->Version = cfg.g_profile; @@ -355,6 +356,17 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); } +#if CONFIG_MULTI_RES_ENCODING + if(mr_cfg) + { + oxcf->mr_total_resolutions = mr_cfg->mr_total_resolutions; + oxcf->mr_encoder_id = mr_cfg->mr_encoder_id; + oxcf->mr_down_sampling_factor.num = mr_cfg->mr_down_sampling_factor.num; + oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den; + oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info; + } +#endif + //oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); @@ -432,7 +444,7 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, if (!res) { ctx->cfg = *cfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } @@ -498,14 +510,38 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx, if (!res) { ctx->vp8_cfg = xcfg; - set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg); + set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); } return res; #undef MAP } -static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) + +static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc) +{ + vpx_codec_err_t res = 0; + +#if CONFIG_MULTI_RES_ENCODING + int mb_rows = ((cfg->g_w + 15) >>4); + int mb_cols = ((cfg->g_h + 15) >>4); + + *mem_loc = calloc(mb_rows*mb_cols, sizeof(LOWER_RES_INFO)); + if(!(*mem_loc)) + { + free(*mem_loc); + res = VPX_CODEC_MEM_ERROR; + } + else + res = VPX_CODEC_OK; +#endif + + return res; +} + +static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *mr_cfg) { vpx_codec_err_t res = VPX_DEC_OK; struct vpx_codec_alg_priv *priv; @@ -570,9 +606,16 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) if (!res) { + if(mr_cfg) + ctx->priv->enc.total_encoders = mr_cfg->mr_total_resolutions; + else + ctx->priv->enc.total_encoders = 1; + set_vp8e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, - ctx->priv->alg_priv->vp8_cfg); + ctx->priv->alg_priv->vp8_cfg, + mr_cfg); + optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf); if (!optr) @@ -587,6 +630,11 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) static vpx_codec_err_t vp8e_destroy(vpx_codec_alg_priv_t *ctx) { +#if CONFIG_MULTI_RES_ENCODING + /* Free multi-encoder shared memory */ + if (ctx->oxcf.mr_total_resolutions > 0 && (ctx->oxcf.mr_encoder_id == ctx->oxcf.mr_total_resolutions-1)) + free(ctx->oxcf.mr_low_res_mode_info); +#endif free(ctx->cx_data); vp8_remove_compressor(&ctx->cpi); @@ -1223,6 +1271,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; @@ -1307,5 +1356,6 @@ vpx_codec_iface_t vpx_enc_vp8_algo = vp8e_set_config, NOT_IMPLEMENTED, vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index cdfcd2142..54bdb8568 100644 --- a/vp8/vp8_dx_iface.c +++ 
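To summarize the lifetime of the shared low-resolution mode-info buffer in the hunks above: vp8e_mr_alloc_mem sizes it at one LOWER_RES_INFO per macroblock, every encoder in the chain receives the same pointer through vpx_codec_priv_enc_mr_cfg_t, and only the encoder with mr_encoder_id == mr_total_resolutions - 1 frees it in vp8e_destroy. A sketch of the sizing step (illustrative; only the product of the two macroblock counts matters for the allocation):

#include <stddef.h>

/* Bytes needed for the shared buffer holding one LOWER_RES_INFO per
 * macroblock of a w x h frame; mirrors the arithmetic in vp8e_mr_alloc_mem. */
static size_t mr_mode_info_bytes(unsigned int w, unsigned int h,
                                 size_t elem_size /* sizeof(LOWER_RES_INFO) */) {
  size_t mbs_wide = (w + 15) >> 4;
  size_t mbs_high = (h + 15) >> 4;
  return mbs_wide * mbs_high * elem_size;
}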
b/vp8/vp8_dx_iface.c @@ -181,9 +181,11 @@ static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx) /* nothing to clean up */ } -static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx) +static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data) { vpx_codec_err_t res = VPX_CODEC_OK; + (void) data; /* This function only allocates space for the vpx_codec_alg_priv_t * structure. More memory may be required at the time the stream @@ -564,7 +566,7 @@ static vpx_codec_err_t vp8_xma_set_mmap(vpx_codec_ctx_t *ctx, if (done && !res) { vp8_finalize_mmaps(ctx->priv->alg_priv); - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); } return res; diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index b71a54aea..2d99981f5 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -86,6 +86,8 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c VP8_CX_SRCS-yes += encoder/temporal_filter.h +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c +VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c new file mode 100644 index 000000000..732f96e38 --- /dev/null +++ b/vp8_multi_resolution_encoder.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * This is an example demonstrating multi-resolution encoding in VP8. + * High-resolution input video is down-sampled to lower-resolutions. The + * encoder then encodes the video and outputs multiple bitstreams with + * different resolutions. + */ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include "math.h" +#define VPX_CODEC_DISABLE_COMPAT 1 +#include "vpx/vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx_ports/mem_ops.h" +#define interface (vpx_codec_vp8_cx()) +#define fourcc 0x30385056 + +#define IVF_FILE_HDR_SZ (32) +#define IVF_FRAME_HDR_SZ (12) + +/* + * The input video frame is downsampled several times to generate a multi-level + * hierarchical structure. NUM_ENCODERS is defined as the number of encoding + * levels required. For example, if the size of input video is 1280x720, + * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3 + * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and + * 320x180(level 2) respectively. + */ +#define NUM_ENCODERS 3 + +/* This example uses the scaler function in libyuv. */ +#include "third_party/libyuv/include/libyuv/basic_types.h" +#include "third_party/libyuv/include/libyuv/scale.h" +#include "third_party/libyuv/include/libyuv/cpu_id.h" + +static double vp8_mse2psnr(double Samples, double Peak, double Mse) +{ + double psnr; + + if ((double)Mse > 0.0) + psnr = 10.0 * log10(Peak * Peak * Samples / Mse); + else + psnr = 60; // Limit to prevent / 0 + + if (psnr > 60) + psnr = 60; + + return psnr; +} + +static void die(const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + if(fmt[strlen(fmt)-1] != '\n') + printf("\n"); + exit(EXIT_FAILURE); +} + +static void die_codec(vpx_codec_ctx_t *ctx, const char *s) { + const char *detail = vpx_codec_error_detail(ctx); + + printf("%s: %s\n", s, vpx_codec_error(ctx)); + if(detail) + printf(" %s\n",detail); + exit(EXIT_FAILURE); +} + +static int read_frame(FILE *f, vpx_image_t *img) { + size_t nbytes, to_read; + int res = 1; + + to_read = img->w*img->h*3/2; + nbytes = fread(img->planes[0], 1, to_read, f); + if(nbytes != to_read) { + res = 0; + if(nbytes > 0) + printf("Warning: Read partial frame. Check your width & height!\n"); + } + return res; +} + +static void write_ivf_file_header(FILE *outfile, + const vpx_codec_enc_cfg_t *cfg, + int frame_cnt) { + char header[32]; + + if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) + return; + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header+4, 0); /* version */ + mem_put_le16(header+6, 32); /* headersize */ + mem_put_le32(header+8, fourcc); /* headersize */ + mem_put_le16(header+12, cfg->g_w); /* width */ + mem_put_le16(header+14, cfg->g_h); /* height */ + mem_put_le32(header+16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header+20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header+24, frame_cnt); /* length */ + mem_put_le32(header+28, 0); /* unused */ + + if(fwrite(header, 1, 32, outfile)); +} + +static void write_ivf_frame_header(FILE *outfile, + const vpx_codec_cx_pkt_t *pkt) +{ + char header[12]; + vpx_codec_pts_t pts; + + if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) + return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header+4, pts&0xFFFFFFFF); + mem_put_le32(header+8, pts >> 32); + + if(fwrite(header, 1, 12, outfile)); +} + +int main(int argc, char **argv) +{ + FILE *infile, *outfile[NUM_ENCODERS]; + vpx_codec_ctx_t codec[NUM_ENCODERS]; + vpx_codec_enc_cfg_t cfg[NUM_ENCODERS]; + vpx_codec_pts_t frame_cnt = 0; + vpx_image_t raw[NUM_ENCODERS]; + vpx_codec_err_t res[NUM_ENCODERS]; + + int i; + long width; + long height; + int frame_avail; + int got_data; + int flags = 0; + + /*Currently, only realtime mode is supported in multi-resolution encoding.*/ + int arg_deadline = VPX_DL_REALTIME; + + /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you + don't need to know PSNR, which will skip PSNR calculation and save + encoding time. */ + int show_psnr = 0; + uint64_t psnr_sse_total[NUM_ENCODERS] = {0}; + uint64_t psnr_samples_total[NUM_ENCODERS] = {0}; + double psnr_totals[NUM_ENCODERS][4] = {{0,0}}; + int psnr_count[NUM_ENCODERS] = {0}; + + /* Set the required target bitrates for each resolution level. */ + unsigned int target_bitrate[NUM_ENCODERS]={1400, 500, 100}; + /* Enter the frame rate of the input video */ + int framerate = 30; + /* Set down-sampling factor for each resolution level. + dsf[0] controls down sampling from level 0 to level 1; + dsf[1] controls down sampling from level 1 to level 2; + dsf[2] is not used. 
*/ + vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}}; + + if(argc!= (5+NUM_ENCODERS)) + die("Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n", + argv[0]); + + printf("Using %s\n",vpx_codec_iface_name(interface)); + + width = strtol(argv[1], NULL, 0); + height = strtol(argv[2], NULL, 0); + + if(width < 16 || width%2 || height <16 || height%2) + die("Invalid resolution: %ldx%ld", width, height); + + /* Open input video file for encoding */ + if(!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading", argv[3]); + + /* Open output file for each encoder to output bitstreams */ + for (i=0; i< NUM_ENCODERS; i++) + { + if(!(outfile[i] = fopen(argv[i+4], "wb"))) + die("Failed to open %s for writing", argv[i+4]); + } + + show_psnr = strtol(argv[NUM_ENCODERS + 4], NULL, 0); + + /* Populate default encoder configuration */ + for (i=0; i< NUM_ENCODERS; i++) + { + res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0); + if(res[i]) { + printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i])); + return EXIT_FAILURE; + } + } + + /* + * Update the default configuration according to needs of the application. + */ + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].g_threads = 1; /* number of threads used */ + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 4; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 98; + cfg[0].rc_overshoot_pct = 100; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + //cfg[0].rc_dropframe_thresh = 10; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + /* Disable automatic keyframe placement */ + //cfg[0].kf_mode = VPX_KF_DISABLED; + cfg[0].kf_min_dist = cfg[0].kf_max_dist = 1000; + + cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + /* Other-resolution encoder settings */ + for (i=1; i< NUM_ENCODERS; i++) + { + memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t)); + + cfg[i].g_threads = 1; /* number of threads used */ + cfg[i].rc_target_bitrate = target_bitrate[i]; + + /* Note: Width & height of other-resolution encoders are calculated + * from the highest-resolution encoder's size and the corresponding + * down_sampling_factor. + */ + { + unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1; + unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1; + cfg[i].g_w = iw/dsf[i-1].num; + cfg[i].g_h = ih/dsf[i-1].num; + } + + /* Make width & height to be multiplier of 2. */ + // Should support odd size ??? + if((cfg[i].g_w)%2)cfg[i].g_w++; + if((cfg[i].g_h)%2)cfg[i].g_h++; + } + + /* Allocate image for each encoder */ + for (i=0; i< NUM_ENCODERS; i++) + if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 1)) + die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); + + for (i=0; i< NUM_ENCODERS; i++) + write_ivf_file_header(outfile[i], &cfg[i], 0); + + /* Initialize multi-encoder */ + if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS, + (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0])) + die_codec(&codec[0], "Failed to initialize encoder"); + + /* The extra encoding configuration parameters can be set as follows. 
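The per-level dimensions computed below are a ceiling division by the down-sampling factor, bumped to the next even value; with dsf = 2/1 a 1280x720 input yields 640x360 and then 320x180. As a standalone helper (illustrative restatement of the configuration code that follows):

/* Next-level dimension for down-sampling factor num/den:
 * ceil(dim * den / num), rounded up to an even number. */
static unsigned int next_level_dim(unsigned int dim, unsigned int num,
                                   unsigned int den) {
  unsigned int d = (dim * den + num - 1) / num;
  return (d % 2) ? d + 1 : d;
}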
*/ + /* Set encoding speed */ + for ( i=0; i<NUM_ENCODERS; i++) + { + int speed = -6; + if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed)) + die_codec(&codec[i], "Failed to set cpu_used"); + } + /* Set static thresh for highest-resolution encoder. Set it to 1000 for + * better performance. */ + { + unsigned int static_thresh = 1000; + if(vpx_codec_control(&codec[0], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[0], "Failed to set static threshold"); + } + /* Set static thresh = 0 for other encoders for better quality */ + for ( i=1; i<NUM_ENCODERS; i++) + { + unsigned int static_thresh = 0; + if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh)) + die_codec(&codec[i], "Failed to set static threshold"); + } + + frame_avail = 1; + got_data = 0; + + while(frame_avail || got_data) + { + vpx_codec_iter_t iter[NUM_ENCODERS]={NULL}; + const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS]; + + flags = 0; + frame_avail = read_frame(infile, &raw[0]); + + for ( i=1; i<NUM_ENCODERS; i++) + { + if(frame_avail) + { + /*Scale the image down a number of times by downsampling factor*/ + int src_uvwidth = (raw[i-1].d_w + 1) >> 1; + int src_uvheight = (raw[i-1].d_h + 1) >> 1; + const unsigned char* src_y = raw[i-1].planes[VPX_PLANE_Y]; + const unsigned char* src_u = raw[i-1].planes[VPX_PLANE_Y] + + raw[i-1].d_w*raw[i-1].d_h; + const unsigned char* src_v = raw[i-1].planes[VPX_PLANE_Y] + + raw[i-1].d_w*raw[i-1].d_h + + src_uvwidth*src_uvheight; + int dst_uvwidth = (raw[i].d_w + 1) >> 1; + int dst_uvheight = (raw[i].d_h + 1) >> 1; + unsigned char* dst_y = raw[i].planes[VPX_PLANE_Y]; + unsigned char* dst_u = raw[i].planes[VPX_PLANE_Y] + + raw[i].d_w*raw[i].d_h; + unsigned char* dst_v = raw[i].planes[VPX_PLANE_Y] + + raw[i].d_w*raw[i].d_h + + dst_uvwidth*dst_uvheight; + + /* FilterMode 1 or 2 give better psnr than FilterMode 0. */ + I420Scale(src_y, raw[i-1].d_w, src_u, src_uvwidth, src_v, + src_uvwidth, raw[i-1].d_w, raw[i-1].d_h, + dst_y, raw[i].d_w, dst_u, dst_uvwidth, + dst_v, dst_uvwidth, raw[i].d_w, raw[i].d_h, 1); + } + } + + /* Encode each frame at multi-levels */ + if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL, + frame_cnt, 1, flags, arg_deadline)) + die_codec(&codec[0], "Failed to encode frame"); + + for (i=NUM_ENCODERS-1; i>=0 ; i--) + { + got_data = 0; + + while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) ) + { + got_data = 1; + switch(pkt[i]->kind) { + case VPX_CODEC_CX_FRAME_PKT: + write_ivf_frame_header(outfile[i], pkt[i]); + if(fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz, + outfile[i])); + break; + case VPX_CODEC_PSNR_PKT: + if (show_psnr) + { + int j; + + psnr_sse_total[i] += pkt[i]->data.psnr.sse[0]; + psnr_samples_total[i] += pkt[i]->data.psnr.samples[0]; + for (j = 0; j < 4; j++) + { + //fprintf(stderr, "%.3lf ", pkt[i]->data.psnr.psnr[j]); + psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j]; + } + psnr_count[i]++; + } + + break; + default: + break; + } + printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT + && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? 
"K":"."); + fflush(stdout); + } + } + frame_cnt++; + } + printf("\n"); + + fclose(infile); + + for (i=0; i< NUM_ENCODERS; i++) + { + printf("Processed %ld frames.\n",(long int)frame_cnt-1); + + /* Calculate PSNR and print it out */ + if ( (show_psnr) && (psnr_count[i]>0) ) + { + int j; + double ovpsnr = vp8_mse2psnr(psnr_samples_total[i], 255.0, + psnr_sse_total[i]); + + fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i); + + fprintf(stderr, " %.3lf", ovpsnr); + for (j = 0; j < 4; j++) + { + fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]); + } + } + + if(vpx_codec_destroy(&codec[i])) + die_codec(&codec[i], "Failed to destroy codec"); + + /* Try to rewrite the file header with the actual frame count */ + if(!fseek(outfile[i], 0, SEEK_SET)) + write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1); + fclose(outfile[i]); + + vpx_img_free(&raw[i]); + } + + return EXIT_SUCCESS; +} diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index a1ff1921e..0703d6a4f 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -56,9 +56,10 @@ * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_CODEC_INTERNAL_ABI_VERSION (3) /**<\hideinitializer*/ +#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; +typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t; /*!\brief init function pointer prototype * @@ -73,7 +74,8 @@ typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t; * \retval #VPX_CODEC_MEM_ERROR * Memory operation failed. */ -typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx); +typedef vpx_codec_err_t (*vpx_codec_init_fn_t)(vpx_codec_ctx_t *ctx, + vpx_codec_priv_enc_mr_cfg_t *data); /*!\brief destroy function pointer prototype * @@ -264,6 +266,10 @@ typedef vpx_fixed_buf_t * typedef vpx_image_t * (*vpx_codec_get_preview_frame_fn_t)(vpx_codec_alg_priv_t *ctx); +typedef vpx_codec_err_t +(*vpx_codec_enc_mr_get_mem_loc_fn_t)(const vpx_codec_enc_cfg_t *cfg, + void **mem_loc); + /*!\brief usage configuration mapping * * This structure stores the mapping between usage identifiers and @@ -309,8 +315,9 @@ struct vpx_codec_iface vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ vpx_codec_enc_config_set_fn_t cfg_set; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ - vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_enc_config_set_fn_t */ + vpx_codec_get_global_headers_fn_t get_glob_hdrs; /**< \copydoc ::vpx_codec_get_global_headers_fn_t */ vpx_codec_get_preview_frame_fn_t get_preview; /**< \copydoc ::vpx_codec_get_preview_frame_fn_t */ + vpx_codec_enc_mr_get_mem_loc_fn_t mr_get_mem_loc; /**< \copydoc ::vpx_codec_enc_mr_get_mem_loc_fn_t */ } enc; }; @@ -353,9 +360,21 @@ struct vpx_codec_priv unsigned int cx_data_pad_before; unsigned int cx_data_pad_after; vpx_codec_cx_pkt_t cx_data_pkt; + unsigned int total_encoders; } enc; }; +/* + * Multi-resolution encoding internal configuration + */ +struct vpx_codec_priv_enc_mr_cfg +{ + unsigned int mr_total_resolutions; + unsigned int mr_encoder_id; + struct vpx_rational mr_down_sampling_factor; + void* mr_low_res_mode_info; +}; + #undef VPX_CTRL_USE_TYPE #define VPX_CTRL_USE_TYPE(id, typ) \ static typ id##__value(va_list args) {return va_arg(args, typ);} \ diff --git a/vpx/src/vpx_decoder.c 
b/vpx/src/vpx_decoder.c index 5d31c2c49..59a783dd9 100644 --- a/vpx/src/vpx_decoder.c +++ b/vpx/src/vpx_decoder.c @@ -56,7 +56,7 @@ vpx_codec_err_t vpx_codec_dec_init_ver(vpx_codec_ctx_t *ctx, if (!(flags & VPX_CODEC_USE_XMA)) { - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 5e86835ea..bddad23ec 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -51,7 +51,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, ctx->priv = NULL; ctx->init_flags = flags; ctx->config.enc = cfg; - res = ctx->iface->init(ctx); + res = ctx->iface->init(ctx, NULL); if (res) { @@ -66,6 +66,85 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, return SAVE_STATUS(ctx, res); } +vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx, + vpx_codec_iface_t *iface, + vpx_codec_enc_cfg_t *cfg, + int num_enc, + vpx_codec_flags_t flags, + vpx_rational_t *dsf, + int ver) +{ + vpx_codec_err_t res = 0; + + if (ver != VPX_ENCODER_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1)) + res = VPX_CODEC_INVALID_PARAM; + else if (iface->abi_version != VPX_CODEC_INTERNAL_ABI_VERSION) + res = VPX_CODEC_ABI_MISMATCH; + else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_XMA) && !(iface->caps & VPX_CODEC_CAP_XMA)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_PSNR) + && !(iface->caps & VPX_CODEC_CAP_PSNR)) + res = VPX_CODEC_INCAPABLE; + else if ((flags & VPX_CODEC_USE_OUTPUT_PARTITION) + && !(iface->caps & VPX_CODEC_CAP_OUTPUT_PARTITION)) + res = VPX_CODEC_INCAPABLE; + else + { + int i; + void *mem_loc = NULL; + + if(!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) + { + for (i = 0; i < num_enc; i++) + { + vpx_codec_priv_enc_mr_cfg_t mr_cfg; + + /* Validate down-sampling factor. */ + if(dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || + dsf->den > dsf->num) + { + res = VPX_CODEC_INVALID_PARAM; + break; + } + + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc-1-i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); + + if (res) + { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + vpx_codec_destroy(ctx); + } + + if (ctx->priv) + ctx->priv->iface = ctx->iface; + + if (res) + break; + + ctx++; + cfg++; + dsf++; + } + } + } + + return SAVE_STATUS(ctx, res); +} vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, @@ -123,7 +202,7 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, vpx_enc_frame_flags_t flags, unsigned long deadline) { - vpx_codec_err_t res; + vpx_codec_err_t res = 0; if (!ctx || (img && !duration)) res = VPX_CODEC_INVALID_PARAM; @@ -136,9 +215,36 @@ vpx_codec_err_t vpx_codec_encode(vpx_codec_ctx_t *ctx, /* Execute in a normalized floating point environment, if the platform * requires it. 
          */
+        unsigned int num_enc = ctx->priv->enc.total_encoders;
+
         FLOATING_POINT_INIT();
-        res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
-                                     duration, flags, deadline);
+
+        if (num_enc == 1)
+            res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+                                         duration, flags, deadline);
+        else
+        {
+            /* Multi-resolution encoding:
+             * Encode the levels in reverse order. For example,
+             * if mr_total_resolutions = 3, first encode level 2,
+             * then encode level 1, and finally encode level 0.
+             */
+            int i;
+
+            ctx += num_enc - 1;
+            if (img) img += num_enc - 1;
+
+            for (i = num_enc-1; i >= 0; i--)
+            {
+                if ((res = ctx->iface->enc.encode(ctx->priv->alg_priv, img, pts,
+                                                  duration, flags, deadline)))
+                    break;
+
+                ctx--;
+                if (img) img--;
+            }
+        }
+
         FLOATING_POINT_RESTORE();
     }
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 87ab20c75..885ca229f 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -634,7 +634,6 @@ extern "C" {
      * then ts_layer_id = (0,1,0,1,0,1,0,1).
      */
     unsigned int           ts_layer_id[MAX_PERIODICITY];
-
   } vpx_codec_enc_cfg_t; /**< alias for struct vpx_codec_enc_cfg */
@@ -675,6 +674,48 @@ extern "C" {
     vpx_codec_enc_init_ver(ctx, iface, cfg, flags, VPX_ENCODER_ABI_VERSION)
+
+  /*!\brief Initialize multi-encoder instance
+   *
+   * Initializes a multi-encoder context using the given interface.
+   * Applications should call the vpx_codec_enc_init_multi convenience macro
+   * instead of this function directly, to ensure that the ABI version number
+   * parameter is properly initialized.
+   *
+   * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags
+   * parameter), the storage pointed to by the cfg parameter must be
+   * kept readable and stable until all memory maps have been set.
+   *
+   * \param[in]    ctx      Pointer to this instance's context.
+   * \param[in]    iface    Pointer to the algorithm interface to use.
+   * \param[in]    cfg      Configuration to use, one per encoder. Must not be NULL.
+   * \param[in]    num_enc  Total number of encoders.
+   * \param[in]    flags    Bitfield of VPX_CODEC_USE_* flags
+   * \param[in]    dsf      Pointer to num_enc down-sampling factors.
+   * \param[in]    ver      ABI version number. Must be set to
+   *                        VPX_ENCODER_ABI_VERSION
+   * \retval #VPX_CODEC_OK
+   *     The encoder algorithm has been initialized.
+   * \retval #VPX_CODEC_MEM_ERROR
+   *     Memory allocation failed.
+   */
+  vpx_codec_err_t vpx_codec_enc_init_multi_ver(vpx_codec_ctx_t *ctx,
+                                               vpx_codec_iface_t *iface,
+                                               vpx_codec_enc_cfg_t *cfg,
+                                               int num_enc,
+                                               vpx_codec_flags_t flags,
+                                               vpx_rational_t *dsf,
+                                               int ver);
+
+
+  /*!\brief Convenience macro for vpx_codec_enc_init_multi_ver()
+   *
+   * Ensures the ABI version parameter is properly set.
+   */
+#define vpx_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+  vpx_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+                               VPX_ENCODER_ABI_VERSION)
+
+
   /*!\brief Get a default configuration
    *
    * Initializes a encoder configuration structure with default values. Supports
@@ -780,7 +821,6 @@ extern "C" {
                                      vpx_enc_frame_flags_t    flags,
                                      unsigned long            deadline);
-
   /*!\brief Set compressed data output buffer
    *
    * Sets the buffer that the codec should output the compressed data