Updates libyuv to version 1005

Also adds compile check and a libyuv configure flag

Change-Id: Ib9f0f4a71c4083e6f0aea7b5a5d175531ef0f66b
This commit is contained in:
Deb Mukherjee 2014-05-16 18:52:01 -07:00
parent ef750d8472
commit 47031c0a54
28 changed files with 32514 additions and 4290 deletions

11
configure vendored
View File

@ -52,6 +52,7 @@ Advanced options:
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
${toggle_webm_io} enable input from and output to WebM container
${toggle_libyuv} enable libyuv
Codecs:
Codecs can be selectively enabled or disabled individually, or by family:
@ -315,6 +316,7 @@ CONFIG_LIST="
os_support
unit_tests
webm_io
libyuv
decode_perf_tests
multi_res_encoding
temporal_denoising
@ -368,6 +370,7 @@ CMDLINE_SELECT="
postproc_visualizer
unit_tests
webm_io
libyuv
decode_perf_tests
multi_res_encoding
temporal_denoising
@ -709,9 +712,11 @@ process_toolchain() {
*-vs*)
soft_enable unit_tests
soft_enable webm_io
soft_enable libyuv
;;
*-android-*)
soft_enable webm_io
soft_enable libyuv
# GTestLog must be modified to use Android logging utilities.
;;
*-darwin-*)
@ -727,6 +732,9 @@ int z;
EOF
check_cxx "$@" <<EOF && soft_enable webm_io
int z;
EOF
check_cxx "$@" <<EOF && soft_enable libyuv
int z;
EOF
;;
*)
@ -735,6 +743,9 @@ int z;
EOF
check_cxx "$@" <<EOF && soft_enable webm_io
int z;
EOF
check_cxx "$@" <<EOF && soft_enable libyuv
int z;
EOF
;;
esac

View File

@ -10,10 +10,24 @@
LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/include/libyuv/cpu_id.h \
third_party/libyuv/include/libyuv/planar_functions.h \
third_party/libyuv/include/libyuv/row.h \
third_party/libyuv/include/libyuv/scale.h \
third_party/libyuv/source/row.h \
third_party/libyuv/source/scale.c \
third_party/libyuv/source/cpu_id.c
third_party/libyuv/include/libyuv/scale_row.h \
third_party/libyuv/source/cpu_id.cc \
third_party/libyuv/source/planar_functions.cc \
third_party/libyuv/source/row_any.cc \
third_party/libyuv/source/row_common.cc \
third_party/libyuv/source/row_mips.cc \
third_party/libyuv/source/row_neon.cc \
third_party/libyuv/source/row_posix.cc \
third_party/libyuv/source/row_win.cc \
third_party/libyuv/source/scale.cc \
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
@ -42,7 +56,9 @@ vpxdec.SRCS += args.c args.h
vpxdec.SRCS += ivfdec.c ivfdec.h
vpxdec.SRCS += tools_common.c tools_common.h
vpxdec.SRCS += y4menc.c y4menc.h
vpxdec.SRCS += $(LIBYUV_SRCS)
ifeq ($(CONFIG_LIBYUV),yes)
vpxdec.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
@ -60,7 +76,9 @@ vpxenc.SRCS += vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
vpxenc.SRCS += vpxstats.c vpxstats.h
vpxenc.SRCS += $(LIBYUV_SRCS)
ifeq ($(CONFIG_LIBYUV),yes)
vpxenc.SRCS += $(LIBYUV_SRCS)
endif
ifeq ($(CONFIG_WEBM_IO),yes)
vpxenc.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxenc.SRCS += webmenc.cc webmenc.h
@ -160,11 +178,13 @@ vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame
ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
ifeq ($(CONFIG_LIBYUV),yes)
EXAMPLES-$(CONFIG_VP8_DECODER) += vp8_multi_resolution_encoder.c
vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS)
vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de
vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
endif
endif
# Handle extra library flags depending on codec configuration

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 102
Version: 1005
License: BSD
License File: LICENSE
@ -13,5 +13,5 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
Modified the original scaler code from C++ to C to fit in our current build
system. This is a temporal solution, and will be improved later.
Modified the original scaler code minimally with include file changes to fit
in our current build system.

View File

@ -1,22 +1,25 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
#include <stddef.h> // for NULL, size_t
#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
#include <sys/types.h> // for uintptr_t on x86
#else
#include <stdint.h> // for uintptr_t
#endif
#ifndef GG_LONGLONG
#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
@ -30,9 +33,9 @@ typedef __int64 int64;
#endif
#define INT64_F "I64"
#else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64; // NOLINT
typedef long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
@ -40,9 +43,9 @@ typedef long int64;
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64;
typedef long long int64;
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64; // NOLINT
typedef long long int64; // NOLINT
#ifndef INT64_C
#define INT64_C(x) x ## LL
#endif
@ -54,20 +57,62 @@ typedef long long int64;
#endif // COMPILER_MSVC
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16;
typedef short int16;
typedef unsigned short uint16; // NOLINT
typedef short int16; // NOLINT
typedef unsigned char uint8;
typedef char int8;
typedef signed char int8;
#endif // INT_TYPES_DEFINED
#endif // GG_LONGLONG
// Detect compiler is for x86 or x64.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86)
#define CPU_X86 1
#endif
// Detect compiler is for ARM.
#if defined(__arm__) || defined(_M_ARM)
#define CPU_ARM 1
#endif
#ifndef ALIGNP
#ifdef __cplusplus
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + \
((t)-1)) & ~((t)-1))))
(reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
((t) - 1)) & ~((t) - 1))))
#else
#define ALIGNP(p, t) \
((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
#endif
#endif
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
#if !defined(LIBYUV_API)
#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
#define LIBYUV_API __declspec(dllexport)
#elif defined(LIBYUV_USING_SHARED_LIBRARY)
#define LIBYUV_API __declspec(dllimport)
#else
#define LIBYUV_API
#endif // LIBYUV_BUILDING_SHARED_LIBRARY
#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
(defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
defined(LIBYUV_USING_SHARED_LIBRARY))
#define LIBYUV_API __attribute__ ((visibility ("default")))
#else
#define LIBYUV_API
#endif // __GNUC__
#endif // LIBYUV_API
#define LIBYUV_BOOL int
#define LIBYUV_FALSE 0
#define LIBYUV_TRUE 1
// Visual C x86 or GCC little endian.
#if defined(__x86_64__) || defined(_M_X64) || \
defined(__i386__) || defined(_M_IX86) || \
defined(__arm__) || defined(_M_ARM) || \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define LIBYUV_LITTLE_ENDIAN
#endif
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT

View File

@ -1,49 +1,81 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_CPU_ID_H_
#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#define INCLUDE_LIBYUV_CPU_ID_H_
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// These flags are only valid on x86 processors
static const int kCpuHasSSE2 = 1;
static const int kCpuHasSSSE3 = 2;
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
#define kCpuInit 0x1
// These flags are only valid on ARM processors
static const int kCpuHasNEON = 4;
// These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2;
static const int kCpuHasNEON = 0x4;
// 0x8 reserved for future ARM flag.
// Internal flag to indicate cpuid is initialized.
static const int kCpuInitialized = 8;
// These flags are only valid on x86 processors.
static const int kCpuHasX86 = 0x10;
static const int kCpuHasSSE2 = 0x20;
static const int kCpuHasSSSE3 = 0x40;
static const int kCpuHasSSE41 = 0x80;
static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasMIPS_DSP = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
int InitCpuFlags(void);
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
// Detect CPU has SSE2 etc.
// test_flag parameter should be one of kCpuHas constants above
// Test_flag parameter should be one of kCpuHas constants above.
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
extern int cpu_info_;
extern int InitCpuFlags();
return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
LIBYUV_API extern int cpu_info_;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(0) to disable all cpu specific optimizations.
LIBYUV_API
void MaskCpuFlags(int enable_flags);
// Low level cpuid for X86. Returns zeros on other CPUs.
// eax is the info type that you want.
// ecx is typically the cpu number, and should normally be zero.
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_CPU_ID_H_
#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT

View File

@ -0,0 +1,439 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
#include "basic_types.h"
// TODO(fbarchard): Remove the following headers includes.
// #include "convert.h"
// #include "convert_argb.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Copy a plane of data.
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
LIBYUV_API
void CopyPlane_16(const uint16* src_y, int src_stride_y,
uint16* dst_y, int dst_stride_y,
int width, int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Copy I422 to I422.
#define I422ToI422 I422Copy
LIBYUV_API
int I422Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Copy I444 to I444.
#define I444ToI444 I444Copy
LIBYUV_API
int I444Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alias
#define I420ToI420Mirror I420Mirror
// I420 mirror.
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Alias
#define I400ToI400Mirror I400Mirror
// I400 mirror. A single plane is mirrored horizontally.
// Pass negative height to achieve 180 degree rotation.
LIBYUV_API
int I400Mirror(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
int I422ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height);
// Convert I422 to ABGR.
LIBYUV_API
int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I422 to RGBA.
LIBYUV_API
int I422ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int x, int y, int width, int height,
int value_y, int value_u, int value_v);
// Draw a rectangle into ARGB.
LIBYUV_API
int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height, uint32 value);
// Convert ARGB to gray scale ARGB.
LIBYUV_API
int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Make a rectangle of ARGB gray scale.
LIBYUV_API
int ARGBGray(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
// The first 4 coefficients apply to B, G, R, A and produce B of the output.
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The next 4 coefficients apply to B, G, R, A and produce R of the output.
// The last 4 coefficients apply to B, G, R, A and produce A of the output.
LIBYUV_API
int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
const int8* matrix_argb,
int width, int height);
// Deprecated. Use ARGBColorMatrix instead.
// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
// The first 4 coefficients apply to B, G, R, A and produce B of the output.
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The last 4 coefficients apply to B, G, R, A and produce R of the output.
LIBYUV_API
int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
const int8* matrix_rgb,
int x, int y, int width, int height);
// Apply a color table each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
// Apply a color table each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
// Apply a luma/color table each ARGB pixel but preserve destination alpha.
// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
// RGB (YJ style) and C is an 8 bit color component (R, G or B).
LIBYUV_API
int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
const uint8* luma_rgb_table,
int width, int height);
// Apply a 3 term polynomial to ARGB values.
// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
// g squared, r squared and a squared. The 4rd row is coefficients for b to
// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
// result clamped to 0 to 255.
// A polynomial approximation can be dirived using software such as 'R'.
LIBYUV_API
int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
const float* poly,
int width, int height);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
LIBYUV_API
int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
int scale, int interval_size, int interval_offset,
int x, int y, int width, int height);
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
// Get function to Alpha Blend ARGB pixels and store to destination.
LIBYUV_API
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Add ARGB image with ARGB image. Saturates to 255.
LIBYUV_API
int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
LIBYUV_API
int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I422 to YUY2.
LIBYUV_API
int I422ToYUY2(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert I422 to UYVY.
LIBYUV_API
int I422ToUYVY(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert unattentuated ARGB to preattenuated ARGB.
LIBYUV_API
int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert preattentuated ARGB to unattenuated ARGB.
LIBYUV_API
int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert MJPG to ARGB.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height);
// Blur ARGB image.
// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
// 16 byte boundary.
// dst_stride32_cumsum is number of ints in a row (width * 4).
// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
// Blur is optimized for radius of 5 (11x11) or less.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
int width, int height, int radius);
// Multiply ARGB image by ARGB value.
LIBYUV_API
int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Interpolate between two ARGB images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
// and 255 means 1% src_argb0 and 99% src_argb1.
// Internally uses ARGBScale bilinear filtering.
// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// Row functions for copying a pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
#define HAS_ARGBAFFINEROW_SSE2
#endif // LIBYUV_DISABLE_X86
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
LIBYUV_API
int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
const uint8* shuffler, int width, int height);
// Sobel ARGB effect with planar output.
LIBYUV_API
int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Sobel ARGB effect.
LIBYUV_API
int ARGBSobel(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
LIBYUV_API
int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT

1704
third_party/libyuv/include/libyuv/row.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,30 +1,45 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_SCALE_H_
#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_H_
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Supported filtering
typedef enum {
kFilterNone = 0, // Point sample; Fastest
kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
kFilterBox = 2 // Highest quality
// Supported filtering.
typedef enum FilterMode {
kFilterNone = 0, // Point sample; Fastest.
kFilterLinear = 1, // Filter horizontally only.
kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
kFilterBox = 3 // Highest quality.
} FilterModeEnum;
// Scale a YUV plane.
LIBYUV_API
void ScalePlane(const uint8* src, int src_stride,
int src_width, int src_height,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@ -35,6 +50,7 @@ typedef enum {
// quality image, at further expense of speed.
// Returns 0 if successful.
LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@ -43,28 +59,44 @@ int I420Scale(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int dst_width, int dst_height,
FilterModeEnum filtering);
enum FilterMode filtering);
// Legacy API. Deprecated
LIBYUV_API
int I420Scale_16(const uint16* src_y, int src_stride_y,
const uint16* src_u, int src_stride_u,
const uint16* src_v, int src_stride_v,
int src_width, int src_height,
uint16* dst_y, int dst_stride_y,
uint16* dst_u, int dst_stride_u,
uint16* dst_v, int dst_stride_v,
int dst_width, int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int src_stride_y, int src_stride_u, int src_stride_v,
int src_width, int src_height,
uint8* dst_y, uint8* dst_u, uint8* dst_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
int interpolate);
LIBYUV_BOOL interpolate);
// Legacy API. Deprecated
int ScaleOffset(const uint8* src, int src_width, int src_height,
uint8* dst, int dst_width, int dst_height, int dst_yoffset,
int interpolate);
// Legacy API. Deprecated.
LIBYUV_API
int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
LIBYUV_BOOL interpolate);
// For testing, allow disabling of optimizations.
void SetUseReferenceImpl(int use);
// For testing, allow disabling of specialized scalers.
LIBYUV_API
void SetUseReferenceImpl(LIBYUV_BOOL use);
#endif // __cplusplus
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_H_
#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT

View File

@ -0,0 +1,341 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#define LIBYUV_DISABLE_X86
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN4_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEADDROWS_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_FIXEDDIV_X86
#define HAS_FIXEDDIV1_X86
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
#endif
// Scale ARGB vertically with bilinear interpolation.
void ScalePlaneVertical(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
int x, int y, int dy,
int bpp, enum FilterMode filtering);
void ScalePlaneVertical_16(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_argb, uint16* dst_argb,
int x, int y, int dy,
int wpp, enum FilterMode filtering);
// Simplify the filtering based on scale factors.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
int dst_width, int dst_height,
enum FilterMode filtering);
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div);
int FixedDiv_X86(int num, int div);
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div);
int FixedDiv1_X86(int num, int div);
#ifdef HAS_FIXEDDIV_X86
#define FixedDiv FixedDiv_X86
#define FixedDiv1 FixedDiv1_X86
#else
#define FixedDiv FixedDiv_C
#define FixedDiv1 FixedDiv1_C
#endif
// Compute slope values for stepping.
void ScaleSlope(int src_width, int src_height,
int dst_width, int dst_height,
enum FilterMode filtering,
int* x, int* y, int* dx, int* dy);
void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* d, int dst_width);
void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int, int);
void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int, int);
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height);
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int, int);
void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Row functions.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT

View File

@ -1,81 +0,0 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/cpu_id.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif
#include "third_party/libyuv/include/libyuv/basic_types.h" // for CPU_X86
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
);
}
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// CPU detect function for SIMD instruction sets.
int cpu_info_ = 0;
int InitCpuFlags() {
#ifdef CPU_X86
int cpu_info[4];
__cpuid(cpu_info, 1);
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
uint64_t features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
cpu_info_ = kCpuHasNEON | kCpuInitialized;
#else
cpu_info_ = kCpuInitialized;
#endif
return cpu_info_;
}
void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

283
third_party/libyuv/source/cpu_id.cc vendored Normal file
View File

@ -0,0 +1,283 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/cpu_id.h"
#ifdef _MSC_VER
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && defined(_M_X64) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
#if !defined(__native_client__)
#include <stdlib.h> // For getenv()
#endif
// For ArmCpuCaps() but unittested on all platforms
#include <stdio.h>
#include <string.h>
#include "third_party/libyuv/include/libyuv/basic_types.h" // For CPU_X86
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check.
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
mov edi, cpu_info
cpuid
mov [edi], eax
mov [edi + 4], ebx
mov [edi + 8], ecx
mov [edi + 12], edx
}
#else
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
#else // defined(_MSC_VER)
uint32 info_ebx, info_edx;
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D" (info_ebx),
#else
"cpuid \n"
: "=b" (info_ebx),
#endif // defined( __i386__) && defined(__PIC__)
"+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
cpu_info[0] = info_eax;
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
}
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
char cpuinfo_line[512];
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// Assume Neon if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasNEON;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
}
}
fclose(f);
return 0;
}
#if defined(__mips__) && defined(__linux__)
static int MipsCpuCaps(const char* search_string) {
char cpuinfo_line[512];
const char* file_name = "/proc/cpuinfo";
FILE* f = fopen(file_name, "r");
if (!f) {
// Assume DSP if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasMIPS_DSP;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
}
}
fclose(f);
return 0;
}
#endif
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
#if !defined(__native_client__) && !defined(_M_ARM)
static LIBYUV_BOOL TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
return LIBYUV_TRUE;
}
}
return LIBYUV_FALSE;
}
#else // nacl does not support getenv().
static LIBYUV_BOOL TestEnv(const char*) {
return LIBYUV_FALSE;
}
#endif
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
CpuId(1, 0, cpu_info1);
CpuId(7, 0, cpu_info7);
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
TestOsSaveYmm()) { // Saves YMM.
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
kCpuHasAVX;
}
#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info_ &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info_ &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info_ &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3;
}
#elif defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info_ |= kCpuHasMIPS_DSPR2;
#endif
cpu_info_ |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_MIPS")) {
cpu_info_ &= ~kCpuHasMIPS;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
cpu_info_ &= ~kCpuHasMIPS_DSP;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#elif defined(__arm__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#endif
cpu_info_ |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info_ &= ~kCpuHasNEON;
}
#endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = 0;
}
return cpu_info_;
}
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,264 +0,0 @@
/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef LIBYUV_SOURCE_ROW_H_
#define LIBYUV_SOURCE_ROW_H_
#include "third_party/libyuv/include/libyuv/basic_types.h"
#define kMaxStride (2048 * 4)
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
#define YUV_DISABLE_ASM
#endif
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_FASTCONVERTYUVTOARGBROW_NEON
void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#define HAS_FASTCONVERTYUVTOABGRROW_NEON
void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
// The following are available on all x86 platforms
#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_FASTCONVERTYTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
#define HAS_REVERSE_ROW_SSSE3
#endif
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_REVERSE_ROW_NEON
#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
#define HASRGB24TOYROW_SSSE3
#endif
#ifdef HASRGB24TOYROW_SSSE3
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#endif
#ifdef HAS_REVERSE_ROW_SSSE3
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
#endif
#ifdef HAS_REVERSE_ROW_NEON
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
#endif
void ReverseRow_C(const uint8* src, uint8* dst, int width);
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#ifdef HAS_BG24TOARGBROW_SSSE3
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
#endif
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
typedef __declspec(align(16)) signed char vec8[16];
typedef __declspec(align(16)) unsigned char uvec8[16];
typedef __declspec(align(16)) signed short vec16[8];
#else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
typedef signed char __attribute__((vector_size(16))) vec8;
typedef unsigned char __attribute__((vector_size(16))) uvec8;
typedef signed short __attribute__((vector_size(16))) vec16;
#endif
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
//extern "C"
SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width);
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // LIBYUV_SOURCE_ROW_H_

542
third_party/libyuv/source/row_any.cc vendored Normal file
View File

@ -0,0 +1,542 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/row.h"
#include "third_party/libyuv/include/libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
// TODO(fbarchard): Consider 'any' functions handling odd alignment.
// YUV to RGB does multiple of 8 with SIMD and remainder with C.
#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, \
const uint8* u_buf, \
const uint8* v_buf, \
uint8* rgb_buf, \
int width) { \
int n = width & ~MASK; \
I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
I420TORGB_C(y_buf + n, \
u_buf + (n >> UV_SHIFT), \
v_buf + (n >> UV_SHIFT), \
rgb_buf + n * BPP, width & MASK); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
0, 4, 7)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
1, 4, 7)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
2, 4, 7)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
1, 4, 7)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
1, 4, 7)
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
1, 4, 7)
// I422ToRGB565Row_SSSE3 is unaligned.
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
1, 2, 7)
YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
1, 2, 7)
// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
#ifdef HAS_I422TOARGBROW_NEON
YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
1, 2, 7)
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_NEON
#undef YANY
// Wrappers to handle odd width
#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
void NAMEANY(const uint8* y_buf, \
const uint8* uv_buf, \
uint8* rgb_buf, \
int width) { \
int n = width & ~7; \
NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
NV12TORGB_C(y_buf + n, \
uv_buf + (n >> UV_SHIFT), \
rgb_buf + n * BPP, width & 7); \
}
#ifdef HAS_NV12TOARGBROW_SSSE3
NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
0, 4)
NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
0, 4)
#endif // HAS_NV12TOARGBROW_SSSE3
#ifdef HAS_NV12TOARGBROW_NEON
NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
#endif // HAS_NV12TOARGBROW_NEON
#ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
0, 2)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
0, 2)
#endif // HAS_NV12TORGB565ROW_SSSE3
#ifdef HAS_NV12TORGB565ROW_NEON
NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
#endif // HAS_NV12TORGB565ROW_NEON
#undef NVANY
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* src, \
uint8* dst, \
int width) { \
int n = width & ~MASK; \
ARGBTORGB_SIMD(src, dst, n); \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
15, 4, 3)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
15, 4, 3)
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
3, 4, 2)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
3, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
#endif
#if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
15, 2, 4)
RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
15, 2, 4)
// These require alignment on ARGB, so C is used for remainder.
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
15, 3, 4)
RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
15, 3, 4)
RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
7, 2, 4)
RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
7, 2, 4)
RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
7, 2, 4)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
7, 4, 2)
RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
7, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
7, 4, 2)
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
7, 1, 4)
RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
7, 2, 4)
RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
7, 2, 4)
#endif
#undef RGBANY
// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* src, \
uint8* dst, uint32 selector, \
int width) { \
int n = width & ~MASK; \
ARGBTORGB_SIMD(src, dst, selector, n); \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
}
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
7, 4, 1)
#endif
#if defined(HAS_ARGBTOBAYERROW_NEON)
BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
7, 4, 1)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
7, 4, 1)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
7, 4, 1)
#endif
#undef BAYERANY
// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
dst_y + (width - NUM) * BPP, NUM); \
}
#ifdef HAS_ARGBTOYROW_AVX2
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif
#undef YANY
#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
int n = width & ~MASK; \
ARGBTOY_SIMD(src_argb, dst_y, n); \
ARGBTOY_C(src_argb + n * SBPP, \
dst_y + n * BPP, width & MASK); \
}
// Attenuate is destructive so last16 method can not be used due to overlap.
#ifdef HAS_ARGBATTENUATEROW_SSSE3
YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSE2
YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
4, 4, 3)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_AVX2
YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
4, 4, 7)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_NEON
YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
4, 4, 7)
#endif
#undef YANY
// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
width & MASK); \
}
#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
4, 15)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
void NAMEANY(const uint8* src_uv, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
ANYTOUV_C(src_uv + n * BPP, \
dst_u + (n >> SHIFT), \
dst_v + (n >> SHIFT), \
width & MASK); \
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3
UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
ARGBToUV444Row_C, 4, 15, 0)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
YUY2ToUV422Row_C, 2, 31, 1)
UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
UYVYToUV422Row_C, 2, 31, 1)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
ARGBToUV422Row_C, 4, 15, 1)
UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
YUY2ToUV422Row_C, 2, 15, 1)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
UYVYToUV422Row_C, 2, 15, 1)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
ARGBToUV444Row_C, 4, 7, 0)
UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
ARGBToUV422Row_C, 4, 15, 1)
UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
ARGBToUV411Row_C, 4, 31, 2)
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
YUY2ToUV422Row_C, 2, 15, 1)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
UYVYToUV422Row_C, 2, 15, 1)
#endif
#undef UV422ANY
#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
void NAMEANY(const uint8* src_uv, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
ANYTOUV_C(src_uv + n * 2, \
dst_u + n, \
dst_v + n, \
width & MASK); \
}
#ifdef HAS_SPLITUVROW_SSE2
SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
#endif
#ifdef HAS_SPLITUVROW_NEON
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
SplitUVRow_C, 15)
#endif
#undef SPLITUVROWANY
#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
void NAMEANY(const uint8* src_u, const uint8* src_v, \
uint8* dst_uv, int width) { \
int n = width & ~MASK; \
ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
ANYTOUV_C(src_u + n, \
src_v + n, \
dst_uv + n * 2, \
width & MASK); \
}
#ifdef HAS_MERGEUVROW_SSE2
MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
#endif
#ifdef HAS_MERGEUVROW_NEON
MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
#endif
#undef MERGEUVROW_ANY
#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
ARGBMATH_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
width & MASK); \
}
#ifdef HAS_ARGBMULTIPLYROW_SSE2
MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
3)
#endif
#ifdef HAS_ARGBADDROW_SSE2
MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
#endif
#ifdef HAS_ARGBSUBTRACTROW_SSE2
MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
3)
#endif
#ifdef HAS_ARGBMULTIPLYROW_AVX2
MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
7)
#endif
#ifdef HAS_ARGBADDROW_AVX2
MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_AVX2
MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
7)
#endif
#ifdef HAS_ARGBMULTIPLYROW_NEON
MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
7)
#endif
#ifdef HAS_ARGBADDROW_NEON
MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_NEON
MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
7)
#endif
#undef MATHROW_ANY
// Shuffle may want to work in place, so last16 method can not be used.
#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
const uint8* shuffler, int width) { \
int n = width & ~MASK; \
ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
ARGBTOY_C(src_argb + n * SBPP, \
dst_argb + n * BPP, shuffler, width & MASK); \
}
#ifdef HAS_ARGBSHUFFLEROW_SSE2
YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
ARGBShuffleRow_C, 4, 4, 15)
#endif
#ifdef HAS_ARGBSHUFFLEROW_NEON
YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#undef YANY
// Interpolate may want to work in place, so last16 method can not be used.
#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
int n = width & ~MASK; \
TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
n, source_y_fraction); \
TERP_C(dst_ptr + n * BPP, \
src_ptr + n * SBPP, src_stride_ptr, \
width & MASK, source_y_fraction); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
InterpolateRow_C, 1, 1, 32)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
InterpolateRow_C, 1, 1, 3)
#endif
#undef NANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

2286
third_party/libyuv/source/row_common.cc vendored Normal file

File diff suppressed because it is too large Load Diff

991
third_party/libyuv/source/row_mips.cc vendored Normal file
View File

@ -0,0 +1,991 @@
/*
* Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
__asm__ __volatile__ (
".set noreorder \n"
".set noat \n"
"slti $at, %[count], 8 \n"
"bne $at ,$zero, $last8 \n"
"xor $t8, %[src], %[dst] \n"
"andi $t8, $t8, 0x3 \n"
"bne $t8, $zero, unaligned \n"
"negu $a3, %[dst] \n"
// make dst/src aligned
"andi $a3, $a3, 0x3 \n"
"beq $a3, $zero, $chk16w \n"
// word-aligned now count is the remining bytes count
"subu %[count], %[count], $a3 \n"
"lwr $t8, 0(%[src]) \n"
"addu %[src], %[src], $a3 \n"
"swr $t8, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// Now the dst/src are mutually word-aligned with word-aligned addresses
"$chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, chk8w \n"
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n"
// t0 is the "past the end" address
// When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
// the "t0-32" address
// This means: for x=128 the last "safe" a1 address is "t0-160"
// Alternatively, for x=64 the last "safe" a1 address is "t0-96"
// we will use "pref 30,128(a1)", so "t0-160" is the limit
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line of src
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $loop16w \n"
"nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$loop16w: \n"
"pref 0, 96(%[src]) \n"
"lw $t0, 0(%[src]) \n"
"bgtz $v1, $skip_pref30_96 \n" // skip
"lw $t1, 4(%[src]) \n"
"pref 30, 96(%[dst]) \n" // continue
"$skip_pref30_96: \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lw $t0, 32(%[src]) \n"
"bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
"lw $t1, 36(%[src]) \n"
"pref 30, 128(%[dst]) \n" // set dest, addr 128
"$skip_pref30_128: \n"
"lw $t2, 40(%[src]) \n"
"lw $t3, 44(%[src]) \n"
"lw $t4, 48(%[src]) \n"
"lw $t5, 52(%[src]) \n"
"lw $t6, 56(%[src]) \n"
"lw $t7, 60(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst], %[dst], 64 \n" // adding 64 to dest
"sgtu $v1, %[dst], $t9 \n"
"bne %[dst], $a3, $loop16w \n"
" addiu %[src], %[src], 64 \n" // adding 64 to src
"move %[count], $t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// the t8 is the reminder count past 32-bytes
"beq %[count], $t8, chk1w \n"
// count=t8,no 32-byte chunk
" nop \n"
"lw $t0, 0(%[src]) \n"
"lw $t1, 4(%[src]) \n"
"lw $t2, 8(%[src]) \n"
"lw $t3, 12(%[src]) \n"
"lw $t4, 16(%[src]) \n"
"lw $t5, 20(%[src]) \n"
"lw $t6, 24(%[src]) \n"
"lw $t7, 28(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the reminder past 1w chunks
"beq %[count], $t8, $last8 \n"
" subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$wordCopy_loop: \n"
"lw $t3, 0(%[src]) \n"
// the first t3 may be equal t0 ... optimize?
"addiu %[src], %[src],4 \n"
"addiu %[dst], %[dst],4 \n"
"bne %[dst], $a3,$wordCopy_loop \n"
" sw $t3, -4(%[dst]) \n"
// For the last (<8) bytes
"$last8: \n"
"blez %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 -last dst address
"$last8loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst], $a3, $last8loop \n"
" sb $v1, -1(%[dst]) \n"
"leave: \n"
" j $ra \n"
" nop \n"
//
// UNALIGNED case
//
"unaligned: \n"
// got here with a3="negu a1"
"andi $a3, $a3, 0x3 \n" // a1 is word aligned?
"beqz $a3, $ua_chk16w \n"
" subu %[count], %[count], $a3 \n"
// bytes left after initial a3 bytes
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
"swr $v1, 0(%[dst]) \n"
"addu %[dst], %[dst], $a3 \n"
// below the dst will be word aligned (NOTE1)
"$ua_chk16w: \n"
"andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
// t8 is the byte count after 64-byte chunks
"beq %[count], $t8, ua_chk8w \n"
// if a2==t8, no 64-byte chunks
// There will be at most 1 32-byte chunk after it
"subu $a3, %[count], $t8 \n" // the reminder
// Here a3 counts bytes in 16w chunks
"addu $a3, %[dst], $a3 \n"
// Now a3 is the final dst after 64-byte chunks
"addu $t0, %[dst], %[count] \n" // t0 "past the end"
"subu $t9, $t0, 160 \n"
// t9 is the "last safe pref 30,128(a1)" address
"pref 0, 0(%[src]) \n" // first line of src
"pref 0, 32(%[src]) \n" // second line addr 32
"pref 0, 64(%[src]) \n"
"pref 30, 32(%[dst]) \n"
// safe, as we have at least 64 bytes ahead
// In case the a1 > t9 don't use "pref 30" at all
"sgtu $v1, %[dst], $t9 \n"
"bgtz $v1, $ua_loop16w \n"
// skip "pref 30,64(a1)" for too short arrays
" nop \n"
// otherwise, start with using pref30
"pref 30, 64(%[dst]) \n"
"$ua_loop16w: \n"
"pref 0, 96(%[src]) \n"
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"bgtz $v1, $ua_skip_pref30_96 \n"
" lwl $t1, 7(%[src]) \n"
"pref 30, 96(%[dst]) \n"
// continue setting up the dest, addr 96
"$ua_skip_pref30_96: \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"pref 0, 128(%[src]) \n"
// bring the next lines of src, addr 128
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"lwr $t0, 32(%[src]) \n"
"lwl $t0, 35(%[src]) \n"
"lwr $t1, 36(%[src]) \n"
"bgtz $v1, ua_skip_pref30_128 \n"
" lwl $t1, 39(%[src]) \n"
"pref 30, 128(%[dst]) \n"
// continue setting up the dest, addr 128
"ua_skip_pref30_128: \n"
"lwr $t2, 40(%[src]) \n"
"lwl $t2, 43(%[src]) \n"
"lwr $t3, 44(%[src]) \n"
"lwl $t3, 47(%[src]) \n"
"lwr $t4, 48(%[src]) \n"
"lwl $t4, 51(%[src]) \n"
"lwr $t5, 52(%[src]) \n"
"lwl $t5, 55(%[src]) \n"
"lwr $t6, 56(%[src]) \n"
"lwl $t6, 59(%[src]) \n"
"lwr $t7, 60(%[src]) \n"
"lwl $t7, 63(%[src]) \n"
"pref 0, 160(%[src]) \n"
// bring the next lines of src, addr 160
"sw $t0, 32(%[dst]) \n"
"sw $t1, 36(%[dst]) \n"
"sw $t2, 40(%[dst]) \n"
"sw $t3, 44(%[dst]) \n"
"sw $t4, 48(%[dst]) \n"
"sw $t5, 52(%[dst]) \n"
"sw $t6, 56(%[dst]) \n"
"sw $t7, 60(%[dst]) \n"
"addiu %[dst],%[dst],64 \n" // adding 64 to dest
"sgtu $v1,%[dst],$t9 \n"
"bne %[dst],$a3,$ua_loop16w \n"
" addiu %[src],%[src],64 \n" // adding 64 to src
"move %[count],$t8 \n"
// Here we have src and dest word-aligned but less than 64-bytes to go
"ua_chk8w: \n"
"pref 0, 0x0(%[src]) \n"
"andi $t8, %[count], 0x1f \n" // 32-byte chunk?
// the t8 is the reminder count
"beq %[count], $t8, $ua_chk1w \n"
// when count==t8, no 32-byte chunk
"lwr $t0, 0(%[src]) \n"
"lwl $t0, 3(%[src]) \n"
"lwr $t1, 4(%[src]) \n"
"lwl $t1, 7(%[src]) \n"
"lwr $t2, 8(%[src]) \n"
"lwl $t2, 11(%[src]) \n"
"lwr $t3, 12(%[src]) \n"
"lwl $t3, 15(%[src]) \n"
"lwr $t4, 16(%[src]) \n"
"lwl $t4, 19(%[src]) \n"
"lwr $t5, 20(%[src]) \n"
"lwl $t5, 23(%[src]) \n"
"lwr $t6, 24(%[src]) \n"
"lwl $t6, 27(%[src]) \n"
"lwr $t7, 28(%[src]) \n"
"lwl $t7, 31(%[src]) \n"
"addiu %[src], %[src], 32 \n"
"sw $t0, 0(%[dst]) \n"
"sw $t1, 4(%[dst]) \n"
"sw $t2, 8(%[dst]) \n"
"sw $t3, 12(%[dst]) \n"
"sw $t4, 16(%[dst]) \n"
"sw $t5, 20(%[dst]) \n"
"sw $t6, 24(%[dst]) \n"
"sw $t7, 28(%[dst]) \n"
"addiu %[dst], %[dst], 32 \n"
"$ua_chk1w: \n"
"andi %[count], $t8, 0x3 \n"
// now count is the reminder past 1w chunks
"beq %[count], $t8, ua_smallCopy \n"
"subu $a3, $t8, %[count] \n"
// a3 is count of bytes in 1w chunks
"addu $a3, %[dst], $a3 \n"
// now a3 is the dst address past the 1w chunks
// copying in words (4-byte chunks)
"$ua_wordCopy_loop: \n"
"lwr $v1, 0(%[src]) \n"
"lwl $v1, 3(%[src]) \n"
"addiu %[src], %[src], 4 \n"
"addiu %[dst], %[dst], 4 \n"
// note: dst=a1 is word aligned here, see NOTE1
"bne %[dst], $a3, $ua_wordCopy_loop \n"
" sw $v1,-4(%[dst]) \n"
// Now less than 4 bytes (value in count) left to copy
"ua_smallCopy: \n"
"beqz %[count], leave \n"
" addu $a3, %[dst], %[count] \n" // a3 = last dst address
"$ua_smallCopy_loop: \n"
"lb $v1, 0(%[src]) \n"
"addiu %[src], %[src], 1 \n"
"addiu %[dst], %[dst], 1 \n"
"bne %[dst],$a3,$ua_smallCopy_loop \n"
" sb $v1, -1(%[dst]) \n"
"j $ra \n"
" nop \n"
".set at \n"
".set reorder \n"
: [dst] "+r" (dst), [src] "+r" (src)
: [count] "r" (count)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
"t8", "t9", "a3", "v1", "at"
);
}
#endif // HAS_COPYROW_MIPS
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
"addiu %[src_uv], %[src_uv], 32 \n"
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"sw $t9, 0(%[dst_v]) \n"
"sw $t0, 0(%[dst_u]) \n"
"sw $t1, 4(%[dst_v]) \n"
"sw $t2, 4(%[dst_u]) \n"
"sw $t3, 8(%[dst_v]) \n"
"sw $t5, 8(%[dst_u]) \n"
"sw $t6, 12(%[dst_v]) \n"
"sw $t7, 12(%[dst_u]) \n"
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
uint8* dst_v, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lwr $t0, 0(%[src_uv]) \n"
"lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lwr $t1, 4(%[src_uv]) \n"
"lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lwr $t2, 8(%[src_uv]) \n"
"lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lwr $t3, 12(%[src_uv]) \n"
"lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lwr $t5, 16(%[src_uv]) \n"
"lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lwr $t6, 20(%[src_uv]) \n"
"lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lwr $t7, 24(%[src_uv]) \n"
"lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lwr $t8, 28(%[src_uv]) \n"
"lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"addiu %[src_uv], %[src_uv], 32 \n"
"swr $t9, 0(%[dst_v]) \n"
"swl $t9, 3(%[dst_v]) \n"
"swr $t0, 0(%[dst_u]) \n"
"swl $t0, 3(%[dst_u]) \n"
"swr $t1, 4(%[dst_v]) \n"
"swl $t1, 7(%[dst_v]) \n"
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n"
"swr $t3, 8(%[dst_v]) \n"
"swl $t3, 11(%[dst_v]) \n"
"swr $t5, 8(%[dst_u]) \n"
"swl $t5, 11(%[dst_u]) \n"
"swr $t6, 12(%[dst_v]) \n"
"swl $t6, 15(%[dst_v]) \n"
"swr $t7, 12(%[dst_u]) \n"
"swl $t7, 15(%[dst_u]) \n"
"addiu %[dst_u], %[dst_u], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_v], %[dst_v], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"
"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"
"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
);
}
// Convert (4 Y and 2 VU) I422 and arrange RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"
"lw $t4, 4(%[src_ptr]) \n"
"lw $t5, 4(%[src_ptr1]) \n"
"muleu_s.ph.qbl $t6, $t2, $t0 \n"
"muleu_s.ph.qbr $t7, $t2, $t0 \n"
"muleu_s.ph.qbl $t8, $t3, $t1 \n"
"muleu_s.ph.qbr $t9, $t3, $t1 \n"
"muleu_s.ph.qbl $t2, $t4, $t0 \n"
"muleu_s.ph.qbr $t3, $t4, $t0 \n"
"muleu_s.ph.qbl $t4, $t5, $t1 \n"
"muleu_s.ph.qbr $t5, $t5, $t1 \n"
"addq.ph $t6, $t6, $t8 \n"
"addq.ph $t7, $t7, $t9 \n"
"addq.ph $t2, $t2, $t4 \n"
"addq.ph $t3, $t3, $t5 \n"
"shra.ph $t6, $t6, 8 \n"
"shra.ph $t7, $t7, 8 \n"
"shra.ph $t2, $t2, 8 \n"
"shra.ph $t3, $t3, 8 \n"
"precr.qb.ph $t6, $t6, $t7 \n"
"precr.qb.ph $t2, $t2, $t3 \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[src_ptr1], %[src_ptr1], 8 \n"
"addiu %[dst_width], %[dst_width], -8 \n"
"sw $t6, 0(%[dst_ptr]) \n"
"sw $t2, 4(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[dst_ptr], %[dst_ptr], 8 \n"
".set pop \n"
: [dst_ptr] "+r" (dst_ptr),
[src_ptr1] "+r" (src_ptr1),
[src_ptr] "+r" (src_ptr),
[dst_width] "+r" (dst_width)
: [source_y_fraction] "r" (source_y_fraction),
[y0_fraction] "r" (y0_fraction),
[src_stride] "r" (src_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
#endif // __mips_dsp_rev >= 2
#endif // defined(__mips__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

2844
third_party/libyuv/source/row_neon.cc vendored Normal file

File diff suppressed because it is too large Load Diff

6443
third_party/libyuv/source/row_posix.cc vendored Normal file

File diff suppressed because it is too large Load Diff

7284
third_party/libyuv/source/row_win.cc vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1716
third_party/libyuv/source/scale.cc vendored Normal file

File diff suppressed because it is too large Load Diff

1165
third_party/libyuv/source/scale_common.cc vendored Normal file

File diff suppressed because it is too large Load Diff

653
third_party/libyuv/source/scale_mips.cc vendored Normal file
View File

@ -0,0 +1,653 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "third_party/libyuv/include/libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 4 \n" // iterations -> by 16
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
// TODO(fbarchard): Use odd pixels instead of even.
"precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
"precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
"precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
"precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t8, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t1, 8(%[dst]) \n"
"sw $t2, 12(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"2: \n"
"andi $t9, %[dst_width], 0xf \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t0, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 2 \n"
"addiu $t9, $t9, -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n" // iterations -> step 8
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 0(%[t]) \n" // |19|18|17|16|
"lw $t5, 4(%[t]) \n" // |23|22|21|20|
"lw $t6, 8(%[t]) \n" // |27|26|25|24|
"lw $t7, 12(%[t]) \n" // |31|30|29|28|
"addiu $t9, $t9, -1 \n"
"srl $t8, $t0, 16 \n" // |X|X|3|2|
"ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
"ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
"raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
"raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
"shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
"shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
"srl $t8, $t1, 16 \n" // |X|X|7|6|
"ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
"ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
"raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
"raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
"shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
"shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
"srl $t8, $t2, 16 \n" // |X|X|11|10|
"ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
"ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
"raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
"raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
"shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
"shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
"srl $t8, $t3, 16 \n" // |X|X|15|14|
"ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
"ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
"raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
"raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
"shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
"shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
"addiu %[src_ptr], %[src_ptr], 16 \n"
"addiu %[t], %[t], 16 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"sb $t1, 2(%[dst]) \n"
"sb $t5, 3(%[dst]) \n"
"sb $t2, 4(%[dst]) \n"
"sb $t6, 5(%[dst]) \n"
"sb $t3, 6(%[dst]) \n"
"sb $t7, 7(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 0x7 \n" // x = residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lwr $t1, 0(%[src_ptr]) \n"
"lwl $t1, 3(%[src_ptr]) \n"
"lwr $t2, 0(%[t]) \n"
"lwl $t2, 3(%[t]) \n"
"srl $t8, $t1, 16 \n"
"ins $t1, $t2, 16, 16 \n"
"ins $t2, $t8, 0, 16 \n"
"raddu.w.qb $t1, $t1 \n"
"raddu.w.qb $t2, $t2 \n"
"shra_r.w $t1, $t1, 2 \n"
"shra_r.w $t2, $t2, 2 \n"
"sb $t1, 0(%[dst]) \n"
"sb $t2, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -2 \n"
"addiu %[t], %[t], 4 \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 2 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst), [t] "+r" (t)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 3 \n"
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t5, 4(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"
"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"
"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"add $t4, $t4, $t5 \n"
"add $t6, $t6, $t7 \n"
"add $t4, $t4, $t6 \n"
"shra_r.w $t0, $t0, 4 \n"
"shra_r.w $t4, $t4, 4 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[s3], %[s3], 8 \n"
"addiu $t9, $t9, -1 \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 2 \n"
"beqz $t8, 2f \n"
" nop \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"shra_r.w $t0, $t0, 4 \n"
"sb $t0, 0(%[dst]) \n"
"2: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
"addiu %[dst_width], %[dst_width], -24 \n"
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
"sw $t1, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $t3, 8(%[dst]) \n"
"sw $t5, 12(%[dst]) \n"
"sw $t9, 16(%[dst]) \n"
"sw $t7, 20(%[dst]) \n"
"bnez %[dst_width], 1b \n"
" addiu %[dst], %[dst], 24 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
"andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t2, $t2, $t4 \n"
"addu.ph $t6, $t6, $t5 \n"
"sll $t5, $t0, 1 \n"
"add $t0, $t5, $t0 \n"
"shra_r.ph $t2, $t2, 2 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shll.ph $t4, $t2, 1 \n"
"addq.ph $t4, $t4, $t2 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.w $t0, $t0, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"shra_r.ph $t6, $t6, 2 \n"
"srl $t1, $t6, 16 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
"rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
"rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
"muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
"muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
"andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
"andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t1, $t1 \n"
"shra_r.w $t0, $t0, 1 \n"
"shra_r.w $t1, $t1, 1 \n"
"preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
"preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
"rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
"rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
"addu.ph $t4, $t4, $t3 \n"
"addu.ph $t6, $t6, $t5 \n"
"shra_r.ph $t6, $t6, 2 \n"
"shra_r.ph $t4, $t4, 2 \n"
"addu.ph $t6, $t6, $t4 \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"shra_r.ph $t6, $t6, 1 \n"
"addu $t0, $t0, $t1 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"shra_r.w $t0, $t0, 1 \n"
"srl $t1, $t6, 16 \n"
"sb $t1, 0(%[d]) \n"
"sb $t0, 1(%[d]) \n"
"sb $t6, 2(%[d]) \n"
"bgtz %[dst_width], 1b \n"
" addiu %[d], %[d], 3 \n"
"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[src_stride] "+r" (src_stride),
[d] "+r" (d),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6"
);
}
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t6, $t6 \n" // |26|27|24|25|
"srl $t0, $t0, 8 \n" // |X|2|3|0|
"srl $t3, $t3, 16 \n" // |X|X|15|14|
"srl $t5, $t5, 16 \n" // |X|X|23|22|
"srl $t7, $t7, 16 \n" // |X|X|31|30|
"ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
"ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
"ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
"ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
"prepend $t2, $t3, 24 \n" // |X|15|14|11|
"ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
"ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu %[dst_width], %[dst_width], -12 \n"
"addiu $t8,%[dst_width], -12 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t4, 4(%[dst]) \n"
"sw $t6, 8(%[dst]) \n"
"bgez $t8, 1b \n"
" addiu %[dst], %[dst], 12 \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[dst_width] "+r" (dst_width)
:
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
const int c = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
"packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
"raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
"precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
"srl $t4, $t4, 2 \n" // t4 / 4
"srl $t6, $t6, 16 \n" // |0|0|S3|T3|
"raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
"addu $t6, $t5, $t6 \n"
"mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
"raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
"addu $t0, $t0, $t2 \n"
"mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[t], %[t], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t4, -1(%[dst_ptr]) \n"
"sb $t6, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[t] "+r" (t),
[dst_width] "+r" (dst_width)
: [c] "r" (c)
: "t0", "t1", "t2", "t3", "t4", "t5", "t6"
);
}
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
stride += stride;
const uint8* s2 = src_ptr + stride;
const int c1 = 0x1C71;
const int c2 = 0x2AAA;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
"lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
"lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
"lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
"lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
"rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
"packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
"raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
"packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
"raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
"sll $t8, $t5, 16 \n" // |R5|R4|0|0|
"raddu.w.qb $t8, $t8 \n" // R5+R4
"addu $t7, $t7, $t8 \n"
"srl $t8, $t5, 16 \n" // |0|0|R7|R6|
"raddu.w.qb $t8, $t8 \n" // R7 + R6
"addu $t6, $t6, $t8 \n"
"mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
"precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
"precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
"srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
"raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
"addu $t7, $t7, $t8 \n"
"mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
"sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
"sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
"sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
"raddu.w.qb $t0, $t0 \n"
"raddu.w.qb $t2, $t2 \n"
"raddu.w.qb $t4, $t4 \n"
"addu $t0, $t0, $t2 \n"
"addu $t0, $t0, $t4 \n"
"mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[dst_width], %[dst_width], -3 \n"
"addiu %[dst_ptr], %[dst_ptr], 3 \n"
"srl $t6, $t6, 16 \n"
"srl $t7, $t7, 16 \n"
"srl $t0, $t0, 16 \n"
"sb $t6, -1(%[dst_ptr]) \n"
"sb $t7, -2(%[dst_ptr]) \n"
"bgtz %[dst_width], 1b \n"
" sb $t0, -3(%[dst_ptr]) \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst_ptr] "+r" (dst_ptr),
[s1] "+r" (s1),
[s2] "+r" (s2),
[dst_width] "+r" (dst_width)
: [c1] "r" (c1), [c2] "r" (c2)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8"
);
}
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

684
third_party/libyuv/source/scale_neon.cc vendored Normal file
View File

@ -0,0 +1,684 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/libyuv/include/libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [r4]! \n"
"vld1.8 {q2}, [r5]! \n"
"vld1.8 {q3}, [%3]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.8 {d4}, [%1]! \n"
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"vld1.8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.8 {d16, d17, d18, d19}, [r4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.8 {d3}, [%1]! \n"
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.8 {d3}, [%1]! \n"
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"vld1.8 {q0}, [%1]! \n"
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
"vld1.8 {q1}, [%1]! \n"
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
"vld2.32 {q0, q1}, [%0]! \n"
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
"vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n"
"vld1.32 {d0[1]}, [%0], r12 \n"
"vld1.32 {d1[0]}, [%0], r12 \n"
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"(src_stepx) // %3
: "memory", "cc", "r12", "q0"
);
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
"vld1.8 {d1}, [%1], r12 \n"
"vld1.8 {d2}, [%0], r12 \n"
"vld1.8 {d3}, [%1], r12 \n"
"vld1.8 {d4}, [%0], r12 \n"
"vld1.8 {d5}, [%1], r12 \n"
"vld1.8 {d6}, [%0], r12 \n"
"vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n"
"vaddl.u8 q2, d4, d5 \n"
"vaddl.u8 q3, d6, d7 \n"
"vswp.8 d1, d2 \n" // ab_cd -> ac_bd
"vswp.8 d5, d6 \n" // ef_gh -> eg_fh
"vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
"vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
"+r"(dst_width) // %3
: "r"(src_stepx) // %4
: "memory", "cc", "r12", "q0", "q1", "q2", "q3"
);
}
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

1315
third_party/libyuv/source/scale_posix.cc vendored Normal file

File diff suppressed because it is too large Load Diff

1320
third_party/libyuv/source/scale_win.cc vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -873,8 +873,16 @@ int main_loop(int argc, const char **argv_) {
}
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
#if CONFIG_LIBYUV
vpx_image_scale(img, scaled_img, kFilterBox);
img = scaled_img;
#else
fprintf(stderr, "Failed to scale output frame: %s.\n"
"Scaling is disabled in this configuration. "
"To enable scaling, configure with --enable-libyuv\n",
vpx_codec_error(&decoder));
return EXIT_FAILURE;
#endif
}
}

View File

@ -1268,6 +1268,7 @@ static void encode_frame(struct stream_state *stream,
fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
exit(EXIT_FAILURE);
}
#if CONFIG_LIBYUV
if (!stream->img)
stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
cfg->g_w, cfg->g_h, 16);
@ -1283,8 +1284,15 @@ static void encode_frame(struct stream_state *stream,
stream->img->stride[VPX_PLANE_V],
stream->img->d_w, stream->img->d_h,
kFilterBox);
img = stream->img;
#else
stream->encoder.err = 1;
ctx_exit_on_error(&stream->encoder,
"Stream %d: Failed to encode frame.\n"
"Scaling disabled in this configuration. \n"
"To enable, configure with --enable-libyuv\n",
stream->index);
#endif
}
vpx_usec_timer_start(&timer);