libyuv: update to b8ddb5a2

Fixes color issue when scaling without breaking mingw.

BUG=https://bugs.chromium.org/p/libyuv/issues/detail?id=605
BUG=https://bugs.chromium.org/p/webm/issues/detail?id=1252

Change-Id: I3920c5664def7ae7a23f60fb160d26d23bc86a27
This commit is contained in:
Jim Bankoski 2016-06-28 07:26:07 -07:00 committed by James Bankoski
parent b34705f64f
commit b8f83282f8
55 changed files with 7011 additions and 8224 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1456 Version: b8ddb5a2
License: BSD License: BSD
License File: LICENSE License File: LICENSE
@ -13,3 +13,13 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams. in order to encode multiple resolution bit streams.
Local Modifications: Local Modifications:
Remove files unnecessary to libvpx build.
rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \
LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \
all.gyp build_overrides/ chromium/ codereview.settings docs/ \
download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \
include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \
libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \
third_party/ tools/ unit_test/ util/ winarm.mk

View File

@ -12,10 +12,8 @@
#define INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
// TODO(fbarchard): Remove the following headers includes.
#include "libyuv/convert_from.h" #include "libyuv/rotate.h" // For enum RotationMode.
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {

View File

@ -12,10 +12,8 @@
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
// TODO(fbarchard): Remove the following headers includes
#include "libyuv/convert_from.h" #include "libyuv/rotate.h" // For enum RotationMode.
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h // TODO(fbarchard): This set of functions should exactly match convert.h
// TODO(fbarchard): Add tests. Create random content of right size and convert // TODO(fbarchard): Add tests. Create random content of right size and convert
@ -60,6 +58,22 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I411 to ARGB. // Convert I411 to ARGB.
LIBYUV_API LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y, int I411ToARGB(const uint8* src_y, int src_stride_y,
@ -68,6 +82,24 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. // Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y, int I400ToARGB(const uint8* src_y, int src_stride_y,
@ -131,6 +163,54 @@ int J422ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB. // BGRA little endian (argb in memory) to ARGB.
LIBYUV_API LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame, int BGRAToARGB(const uint8* src_frame, int src_stride_frame,

View File

@ -56,8 +56,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height); int width, int height);
// TODO(fbarchard): I420ToM420
LIBYUV_API LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y, int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,

View File

@ -18,9 +18,8 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization. // Internal flag to indicate cpuid requires initialization.
#define kCpuInit 0x1 static const int kCpuInitialized = 0x1;
// These flags are only valid on ARM processors. // These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2; static const int kCpuHasARM = 0x2;
@ -37,12 +36,12 @@ static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400; static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800; static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000; static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. // 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors. // These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000; static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasMIPS_DSP = 0x20000; static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init. // Internal function used to auto-init.
LIBYUV_API LIBYUV_API
@ -57,13 +56,13 @@ int ArmCpuCaps(const char* cpuinfo_name);
// returns non-zero if instruction set is detected // returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) { static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_; LIBYUV_API extern int cpu_info_;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
} }
// For testing, allow CPU flags to be disabled. // For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations. // MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(0) to disable all cpu specific optimizations. // MaskCpuFlags(1) to disable all cpu specific optimizations.
LIBYUV_API LIBYUV_API
void MaskCpuFlags(int enable_flags); void MaskCpuFlags(int enable_flags);

View File

@ -145,13 +145,6 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height); int width, int height);
// Convert NV21 to RGB565.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h // I422ToARGB is in convert_argb.h
// Convert I422 to BGRA. // Convert I422 to BGRA.
LIBYUV_API LIBYUV_API
@ -177,6 +170,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
uint8* dst_rgba, int dst_stride_rgba, uint8* dst_rgba, int dst_stride_rgba,
int width, int height); int width, int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
// Draw a rectangle into I420. // Draw a rectangle into I420.
LIBYUV_API LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y, int I420Rect(uint8* dst_y, int dst_stride_y,
@ -281,13 +282,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Copy ARGB to ARGB. // Copy Alpha channel of ARGB to alpha of ARGB.
LIBYUV_API LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Copy ARGB to ARGB. // Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
// Copy Y channel to Alpha of ARGB.
LIBYUV_API LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
@ -301,6 +308,7 @@ LIBYUV_API
ARGBBlendRow GetARGBBlend(); ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination. // Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255. // Alpha of destination is set to 255.
LIBYUV_API LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@ -308,6 +316,31 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@ -357,12 +390,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height); int width, int height);
// Convert MJPG to ARGB.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
// Internal function - do not call directly. // Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum // Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur. // of all values above and to the left of the entry. Used by ARGBBlur.
@ -389,22 +416,49 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value); int width, int height, uint32 value);
// Interpolate between two ARGB images using specified amount of interpolation // Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination. // (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 // 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src_argb0 and 99% src_argb1. // and 255 means 1% src0 and 99% src1.
// Internally uses ARGBScale bilinear filtering. LIBYUV_API
// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
LIBYUV_API LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1, const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation); int width, int height, int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \ #if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__)) (defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86 #define LIBYUV_DISABLE_X86
#endif #endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// The following are available on all x86 platforms: // The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))

View File

@ -22,53 +22,24 @@ extern "C" {
(defined(__i386__) && !defined(__SSE2__)) (defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86 #define LIBYUV_DISABLE_X86
#endif #endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
// Visual C 2012 required for AVX2. #if defined(__has_feature)
#if defined(_M_IX86) && !defined(__clang__) && \ #if __has_feature(memory_sanitizer)
defined(_MSC_VER) && _MSC_VER >= 1700 #define LIBYUV_DISABLE_X86
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif #endif
#endif #endif
// The following are available for Visual C and clangcl 32 bit:
// The following are available for Visual C: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2 #define HAS_TRANSPOSEUVWX8_SSE2
#endif #endif
// The following are available for GCC but not NaCL: // The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEWX8_SSSE3
#endif #endif
// The following are available for 32 bit GCC:
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for 64 bit GCC but not NaCL: // The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__) defined(__x86_64__)
@ -85,8 +56,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \ defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2) defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_MIPS_DSPR2 #define HAS_TRANSPOSEWX8_DSPR2
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2 #define HAS_TRANSPOSEUVWX8_DSPR2
#endif // defined(__mips__) #endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride, void TransposeWxH_C(const uint8* src, int src_stride,
@ -100,7 +71,9 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride, void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
@ -109,8 +82,8 @@ void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width); uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride, void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
@ -126,9 +99,19 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
void TransposeUVWx8_NEON(const uint8* src, int src_stride, void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width); uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width); uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"

File diff suppressed because it is too large Load Diff

View File

@ -35,7 +35,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int clip_x, int clip_y, int clip_width, int clip_height, int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering); enum FilterMode filtering);
// TODO(fbarchard): Implement this.
// Scale with YUV conversion to ARGB and clipping. // Scale with YUV conversion to ARGB and clipping.
LIBYUV_API LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

View File

@ -23,6 +23,26 @@ extern "C" {
(defined(__i386__) && !defined(__SSE2__)) (defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86 #define LIBYUV_DISABLE_X86
#endif #endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
#define GCC_HAS_AVX2 1
#endif // GNUC >= 4.7
#endif // __GNUC__
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif // clang >= 3.4
#endif // __clang__
// Visual C 2012 required for AVX2. // Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \ #if defined(_M_IX86) && !defined(__clang__) && \
@ -42,24 +62,23 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2 #define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3 #define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSE2 #define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSE2 #define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#endif #endif
// The following are available on VS2012: // The following are available on all x86 platforms, but
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) // require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2 #define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2 #define HAS_SCALEROWDOWN4_AVX2
#endif #endif
// The following are available on Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on Neon platforms: // The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@ -77,10 +96,10 @@ extern "C" {
// The following are available on Mips platforms: // The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_MIPS_DSPR2 #define HAS_SCALEROWDOWN2_DSPR2
#define HAS_SCALEROWDOWN4_MIPS_DSPR2 #define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_MIPS_DSPR2 #define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_MIPS_DSPR2 #define HAS_SCALEROWDOWN38_DSPR2
#endif #endif
// Scale ARGB vertically with bilinear interpolation. // Scale ARGB vertically with bilinear interpolation.
@ -133,6 +152,8 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width); uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width); uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
@ -214,22 +235,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx); int dst_width, int x, int dx);
// Specialized scalers for x86. // Specialized scalers for x86.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -251,22 +272,26 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -418,6 +443,8 @@ void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width); uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
@ -447,28 +474,26 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx); int dst_width, int x, int dx);
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width);
uint8* d, int dst_width); void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width);
uint8* d, int dst_width); void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width);
uint8* dst, int dst_width); void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, uint8* dst_ptr, int dst_width);
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1456 #define LIBYUV_VERSION 1601
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar. // 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'), FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@ -90,7 +90,8 @@ enum FourCC {
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'), FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@ -150,6 +151,7 @@ enum FourCCBpp {
FOURCC_BPP_YU12 = 12, FOURCC_BPP_YU12 = 12,
FOURCC_BPP_J420 = 12, FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8, FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0, FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12, FOURCC_BPP_IYUV = 12,

View File

@ -17,6 +17,7 @@
#endif #endif
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/video_common.h" #include "libyuv/video_common.h"
@ -26,30 +27,13 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended. // hash seed of 5381 recommended.
LIBYUV_API LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768; const int kBlockSize = 1 << 15; // 32768;
int remainder; int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41) #if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) { if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41; HashDjb2_SSE = HashDjb2_SSE41;
@ -127,23 +111,6 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc; return fourcc;
} }
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
#ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function. // TODO(fbarchard): Refactor into row function.
LIBYUV_API LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,

View File

@ -10,6 +10,8 @@
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
extern "C" { extern "C" {

View File

@ -9,6 +9,8 @@
*/ */
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@ -16,11 +18,13 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse; uint32 sse;
asm volatile ( // NOLINT asm volatile (
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
@ -54,15 +58,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"+r"(count), // %2 "+r"(count), // %2
"=g"(sse) // %3 "=g"(sse) // %3
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); // NOLINT );
return sse; return sse;
} }
#endif // defined(__x86_64__) || defined(__i386__)
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = { static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15 0x0c3525e1, // 33 ^ 15
@ -91,7 +90,7 @@ static uvec32 kHashMul3 = {
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash; uint32 hash;
asm volatile ( // NOLINT asm volatile (
"movd %2,%%xmm0 \n" "movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n" "movdqa %4,%%xmm6 \n"
@ -140,7 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"m"(kHashMul3) // %8 "m"(kHashMul3) // %8
: "memory", "cc" : "memory", "cc"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
); // NOLINT );
return hash; return hash;
} }
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))

View File

@ -9,6 +9,8 @@
*/ */
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@ -27,7 +29,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n" "vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n" "vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"

View File

@ -9,6 +9,8 @@
*/ */
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@ -26,7 +28,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v17.16b, v17.16b, v17.16b \n" "eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n" "eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"

View File

@ -9,6 +9,8 @@
*/ */
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@ -16,9 +18,8 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// This module is for Visual C x86. // This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@ -100,41 +101,32 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
} }
#endif // _MSC_VER >= 1700 #endif // _MSC_VER >= 1700
#define HAS_HASHDJB2_SSE41 uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 uvec32 kHashMul0 = {
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15 0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14 0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13 0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12 0x4f5f0981, // 33 ^ 12
}; };
static uvec32 kHashMul1 = { uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11 0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10 0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9 0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8 0x747c7101, // 33 ^ 8
}; };
static uvec32 kHashMul2 = { uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7 0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6 0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5 0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4 0x00121881, // 33 ^ 4
}; };
static uvec32 kHashMul3 = { uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3 0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2 0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1 0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0 0x00000001, // 33 ^ 0
}; };
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm { __asm {
@ -143,30 +135,30 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33 movdqa xmm6, xmmword ptr kHash16x33
wloop: wloop:
movdqu xmm1, [eax] // src[0-15] movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16] lea eax, [eax + 16]
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, kHashMul0 movdqa xmm5, xmmword ptr kHashMul0
movdqa xmm2, xmm1 movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7] punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2 movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3] punpcklwd xmm3, xmm7 // src[0-3]
pmulld(0xdd) // pmulld xmm3, xmm5 pmulld xmm3, xmm5
movdqa xmm5, kHashMul1 movdqa xmm5, xmmword ptr kHashMul1
movdqa xmm4, xmm2 movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7] punpckhwd xmm4, xmm7 // src[4-7]
pmulld(0xe5) // pmulld xmm4, xmm5 pmulld xmm4, xmm5
movdqa xmm5, kHashMul2 movdqa xmm5, xmmword ptr kHashMul2
punpckhbw xmm1, xmm7 // src[8-15] punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1 movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11] punpcklwd xmm2, xmm7 // src[8-11]
pmulld(0xd5) // pmulld xmm2, xmm5 pmulld xmm2, xmm5
movdqa xmm5, kHashMul3 movdqa xmm5, xmmword ptr kHashMul3
punpckhwd xmm1, xmm7 // src[12-15] punpckhwd xmm1, xmm7 // src[12-15]
pmulld(0xcd) // pmulld xmm1, xmm5 pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2 paddd xmm1, xmm2
paddd xmm1, xmm3 paddd xmm1, xmm3
@ -191,36 +183,37 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count mov ecx, [esp + 8] // count
movd xmm0, [esp + 12] // seed vmovd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
wloop: wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3] vpmovzxbd xmm3, [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16 vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] vpmovzxbd xmm4, [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0 vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] vpmovzxbd xmm2, [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1 vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] vpmovzxbd xmm1, [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2 vpmulld xmm2, xmm2, xmmword ptr kHashMul2
lea eax, [eax + 16] lea eax, [eax + 16]
pmulld xmm1, kHashMul3 vpmulld xmm1, xmm1, xmmword ptr kHashMul3
paddd xmm3, xmm4 // add 16 results vpaddd xmm3, xmm3, xmm4 // add 16 results
paddd xmm1, xmm2 vpaddd xmm1, xmm1, xmm2
paddd xmm1, xmm3 vpaddd xmm1, xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2 vpaddd xmm1, xmm1,xmm2
pshufd xmm2, xmm1, 0x01 vpshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2 vpaddd xmm1, xmm1, xmm2
paddd xmm0, xmm1 vpaddd xmm0, xmm0, xmm1
sub ecx, 16 sub ecx, 16
jg wloop jg wloop
movd eax, xmm0 // return hash vmovd eax, xmm0 // return hash
vzeroupper
ret ret
} }
} }
#endif // _MSC_VER >= 1700 #endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -245,8 +245,8 @@ static int X420ToI420(const uint8* src_y,
int y; int y;
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1; int halfheight = (height + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
SplitUVRow_C; int width) = SplitUVRow_C;
if (!src_y || !src_uv || if (!src_y || !src_uv ||
!dst_y || !dst_u || !dst_v || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -303,14 +303,14 @@ static int X420ToI420(const uint8* src_y,
} }
} }
#endif #endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2) #if defined(HAS_SPLITUVROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; SplitUVRow = SplitUVRow_Any_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) { if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_MIPS_DSPR2; SplitUVRow = SplitUVRow_DSPR2;
} }
} }
#endif #endif
@ -390,9 +390,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
int width, int height) { int width, int height) {
int y; int y;
void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
void (*YUY2ToYRow)(const uint8* src_yuy2, void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix) = YUY2ToYRow_C; uint8* dst_y, int width) = YUY2ToYRow_C;
// Negative height means invert the image. // Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
@ -455,9 +455,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
int width, int height) { int width, int height) {
int y; int y;
void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
void (*UYVYToYRow)(const uint8* src_uyvy, void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix) = UYVYToYRow_C; uint8* dst_y, int width) = UYVYToYRow_C;
// Negative height means invert the image. // Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
@ -521,7 +521,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
int y; int y;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
if (!src_argb || if (!src_argb ||
!dst_y || !dst_u || !dst_v || !dst_y || !dst_u || !dst_v ||
@ -597,7 +597,7 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
int y; int y;
void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
BGRAToYRow_C; BGRAToYRow_C;
if (!src_bgra || if (!src_bgra ||
!dst_y || !dst_u || !dst_v || !dst_y || !dst_u || !dst_v ||
@ -663,7 +663,7 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
int y; int y;
void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
ABGRToYRow_C; ABGRToYRow_C;
if (!src_abgr || if (!src_abgr ||
!dst_y || !dst_u || !dst_v || !dst_y || !dst_u || !dst_v ||
@ -729,7 +729,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
int y; int y;
void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
RGBAToYRow_C; RGBAToYRow_C;
if (!src_rgba || if (!src_rgba ||
!dst_y || !dst_u || !dst_v || !dst_y || !dst_u || !dst_v ||
@ -796,14 +796,14 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
#if defined(HAS_RGB24TOYROW_NEON) #if defined(HAS_RGB24TOYROW_NEON)
void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
RGB24ToYRow_C; RGB24ToYRow_C;
#else #else
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB24ToARGBRow_C; RGB24ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
#endif #endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v || if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
@ -910,14 +910,14 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
RAWToYRow_C; RAWToYRow_C;
#else #else
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RAWToARGBRow_C; RAWToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
#endif #endif
if (!src_raw || !dst_y || !dst_u || !dst_v || if (!src_raw || !dst_y || !dst_u || !dst_v ||
@ -1024,14 +1024,14 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
#if defined(HAS_RGB565TOYROW_NEON) #if defined(HAS_RGB565TOYROW_NEON)
void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
RGB565ToYRow_C; RGB565ToYRow_C;
#else #else
void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB565ToARGBRow_C; RGB565ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
#endif #endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v || if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
@ -1146,14 +1146,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
ARGB1555ToYRow_C; ARGB1555ToYRow_C;
#else #else
void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
ARGB1555ToARGBRow_C; ARGB1555ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
#endif #endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v || if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
@ -1270,14 +1270,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
#if defined(HAS_ARGB4444TOYROW_NEON) #if defined(HAS_ARGB4444TOYROW_NEON)
void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
ARGB4444ToYRow_C; ARGB4444ToYRow_C;
#else #else
void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
ARGB4444ToARGBRow_C; ARGB4444ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
#endif #endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v || if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||

View File

@ -14,6 +14,7 @@
#ifdef HAVE_JPEG #ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h" #include "libyuv/mjpeg_decoder.h"
#endif #endif
#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
#include "libyuv/rotate_argb.h" #include "libyuv/rotate_argb.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/video_common.h" #include "libyuv/video_common.h"
@ -44,18 +45,347 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Convert I444 to ARGB. // Convert I422 to ARGB with matrix
static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to ARGB.
LIBYUV_API LIBYUV_API
int I444ToARGB(const uint8* src_y, int src_stride_y, int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height);
}
// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
}
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvH709Constants,
width, height);
}
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I422 to ARGB with matrix
static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
// Convert I422 to ARGB.
LIBYUV_API
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height);
}
// Convert I422 to ABGR.
LIBYUV_API
int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
}
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvH709Constants,
width, height);
}
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I444 to ARGB with matrix
static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y; int y;
void (*I444ToARGBRow)(const uint8* y_buf, void (*I444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I444ToARGBRow_C; int width) = I444ToARGBRow_C;
if (!src_y || !src_u || !src_v || if (!src_y || !src_u || !src_v ||
!dst_argb || !dst_argb ||
@ -103,7 +433,7 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
@ -112,81 +442,49 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I422 to ARGB. // Convert I444 to ARGB.
LIBYUV_API LIBYUV_API
int I422ToARGB(const uint8* src_y, int src_stride_y, int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; return I444ToARGBMatrix(src_y, src_stride_y,
void (*I422ToARGBRow)(const uint8* y_buf, src_u, src_stride_u,
const uint8* u_buf, src_v, src_stride_v,
const uint8* v_buf, dst_argb, dst_stride_argb,
uint8* rgb_buf, &kYuvI601Constants,
int width) = I422ToARGBRow_C; width, height);
if (!src_y || !src_u || !src_v || }
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) { // Convert I444 to ABGR.
I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); LIBYUV_API
dst_argb += dst_stride_argb; int I444ToABGR(const uint8* src_y, int src_stride_y,
src_y += src_stride_y; const uint8* src_u, int src_stride_u,
src_u += src_stride_u; const uint8* src_v, int src_stride_v,
src_v += src_stride_v; uint8* dst_abgr, int dst_stride_abgr,
} int width, int height) {
return 0; return I444ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I444ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
} }
// Convert I411 to ARGB. // Convert I411 to ARGB.
@ -201,6 +499,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I411ToARGBRow_C; int width) = I411ToARGBRow_C;
if (!src_y || !src_u || !src_v || if (!src_y || !src_u || !src_v ||
!dst_argb || !dst_argb ||
@ -248,7 +547,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
src_u += src_stride_u; src_u += src_stride_u;
@ -257,6 +556,143 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I420 with Alpha to preattenuated ARGB.
static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height, int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
const uint8* a_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) = I422AlphaToARGBRow_C;
void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBAttenuateRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
if (attenuate) {
ARGBAttenuateRow(dst_argb, dst_argb, width);
}
dst_argb += dst_stride_argb;
src_a += src_stride_a;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 with Alpha to ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate) {
return I420AlphaToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
src_a, src_stride_a,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height, attenuate);
}
// Convert I420 with Alpha to ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate) {
return I420AlphaToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
src_a, src_stride_a,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height, attenuate);
}
// Convert I400 to ARGB. // Convert I400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y, int I400ToARGB(const uint8* src_y, int src_stride_y,
@ -322,7 +758,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
J400ToARGBRow_C; J400ToARGBRow_C;
if (!src_y || !dst_argb || if (!src_y || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -449,7 +885,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RGB24ToARGBRow_C; RGB24ToARGBRow_C;
if (!src_rgb24 || !dst_argb || if (!src_rgb24 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -499,7 +935,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
RAWToARGBRow_C; RAWToARGBRow_C;
if (!src_raw || !dst_argb || if (!src_raw || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -549,7 +985,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
RGB565ToARGBRow_C; RGB565ToARGBRow_C;
if (!src_rgb565 || !dst_argb || if (!src_rgb565 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -608,7 +1044,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
int pix) = ARGB1555ToARGBRow_C; int width) = ARGB1555ToARGBRow_C;
if (!src_argb1555 || !dst_argb || if (!src_argb1555 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@ -666,7 +1102,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
int pix) = ARGB4444ToARGBRow_C; int width) = ARGB4444ToARGBRow_C;
if (!src_argb4444 || !dst_argb || if (!src_argb4444 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@ -727,6 +1163,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
void (*NV12ToARGBRow)(const uint8* y_buf, void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C; int width) = NV12ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -764,7 +1201,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, width); NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -784,6 +1221,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
void (*NV21ToARGBRow)(const uint8* y_buf, void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV21ToARGBRow_C; int width) = NV21ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -821,7 +1259,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_uv, dst_argb, width); NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -840,6 +1278,7 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
void (*NV12ToARGBRow)(const uint8* y_buf, void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf, const uint8* uv_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C; int width) = NV12ToARGBRow_C;
if (!src_m420 || !dst_argb || if (!src_m420 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -877,14 +1316,16 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
&kYuvI601Constants, width);
NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
dst_argb + dst_stride_argb, width); dst_argb + dst_stride_argb, &kYuvI601Constants, width);
dst_argb += dst_stride_argb * 2; dst_argb += dst_stride_argb * 2;
src_m420 += src_stride_m420 * 3; src_m420 += src_stride_m420 * 3;
} }
if (height & 1) { if (height & 1) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
&kYuvI601Constants, width);
} }
return 0; return 0;
} }
@ -895,7 +1336,10 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = void (*YUY2ToARGBRow)(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) =
YUY2ToARGBRow_C; YUY2ToARGBRow_C;
if (!src_yuy2 || !dst_argb || if (!src_yuy2 || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -939,7 +1383,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, width); YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
src_yuy2 += src_stride_yuy2; src_yuy2 += src_stride_yuy2;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
@ -952,7 +1396,10 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = void (*UYVYToARGBRow)(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) =
UYVYToARGBRow_C; UYVYToARGBRow_C;
if (!src_uyvy || !dst_argb || if (!src_uyvy || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -996,159 +1443,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, width); UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
src_uyvy += src_stride_uyvy; src_uyvy += src_stride_uyvy;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
return 0; return 0;
} }
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -445,221 +445,24 @@ int I420ToNV21(const uint8* src_y, int src_stride_y,
return I420ToNV12(src_y, src_stride_y, return I420ToNV12(src_y, src_stride_y,
src_v, src_stride_v, src_v, src_stride_v,
src_u, src_stride_u, src_u, src_stride_u,
dst_y, src_stride_y, dst_y, dst_stride_y,
dst_vu, dst_stride_vu, dst_vu, dst_stride_vu,
width, height); width, height);
} }
// Convert I420 to ARGB. // Convert I422 to RGBA with matrix
LIBYUV_API static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
int I420ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u,
const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v,
const uint8* src_v, int src_stride_v, uint8* dst_rgba, int dst_stride_rgba,
uint8* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants,
int width, int height) { int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to BGRA.
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
int y;
void (*I422ToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToBGRARow_C;
if (!src_y || !src_u || !src_v || !dst_bgra ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
dst_stride_bgra = -dst_stride_bgra;
}
#if defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
int y;
void (*I422ToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToABGRRow_C;
if (!src_y || !src_u || !src_v || !dst_abgr ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr;
}
#if defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOABGRROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToABGRRow = I422ToABGRRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to RGBA.
LIBYUV_API
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
int y; int y;
void (*I422ToRGBARow)(const uint8* y_buf, void (*I422ToRGBARow)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGBARow_C; int width) = I422ToRGBARow_C;
if (!src_y || !src_u || !src_v || !dst_rgba || if (!src_y || !src_u || !src_v || !dst_rgba ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -695,9 +498,18 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORGBAROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
dst_rgba += dst_stride_rgba; dst_rgba += dst_stride_rgba;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -708,18 +520,49 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I420 to RGB24. // Convert I420 to RGBA.
LIBYUV_API LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y, int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24, uint8* dst_rgba, int dst_stride_rgba,
int width, int height) { int width, int height) {
return I420ToRGBAMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_rgba, dst_stride_rgba,
&kYuvI601Constants,
width, height);
}
// Convert I420 to BGRA.
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
return I420ToRGBAMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_bgra, dst_stride_bgra,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to RGB24 with matrix
static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y; int y;
void (*I422ToRGB24Row)(const uint8* y_buf, void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGB24Row_C; int width) = I422ToRGB24Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb24 || if (!src_y || !src_u || !src_v || !dst_rgb24 ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -757,7 +600,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
dst_rgb24 += dst_stride_rgb24; dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -768,64 +611,34 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Convert I420 to RGB24.
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
return I420ToRGB24Matrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_rgb24, dst_stride_rgb24,
&kYuvI601Constants,
width, height);
}
// Convert I420 to RAW. // Convert I420 to RAW.
LIBYUV_API LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y, int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u, const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v, const uint8* src_v, int src_stride_v,
uint8* dst_raw, int dst_stride_raw, uint8* dst_raw, int dst_stride_raw,
int width, int height) { int width, int height) {
int y; return I420ToRGB24Matrix(src_y, src_stride_y,
void (*I422ToRAWRow)(const uint8* y_buf, src_v, src_stride_v, // Swap U and V
const uint8* u_buf, src_u, src_stride_u,
const uint8* v_buf, dst_raw, dst_stride_raw,
uint8* rgb_buf, &kYvuI601Constants, // Use Yvu matrix
int width) = I422ToRAWRow_C; width, height);
if (!src_y || !src_u || !src_v || !dst_raw ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_raw = dst_raw + (height - 1) * dst_stride_raw;
dst_stride_raw = -dst_stride_raw;
}
#if defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
dst_raw += dst_stride_raw;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
} }
// Convert I420 to ARGB1555. // Convert I420 to ARGB1555.
@ -840,6 +653,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C; int width) = I422ToARGB1555Row_C;
if (!src_y || !src_u || !src_v || !dst_argb1555 || if (!src_y || !src_u || !src_v || !dst_argb1555 ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -877,7 +691,8 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
width);
dst_argb1555 += dst_stride_argb1555; dst_argb1555 += dst_stride_argb1555;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -901,6 +716,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C; int width) = I422ToARGB4444Row_C;
if (!src_y || !src_u || !src_v || !dst_argb4444 || if (!src_y || !src_u || !src_v || !dst_argb4444 ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -938,7 +754,8 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width); I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
width);
dst_argb4444 += dst_stride_argb4444; dst_argb4444 += dst_stride_argb4444;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -961,6 +778,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGB565Row_C; int width) = I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -998,7 +816,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width); I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
dst_rgb565 += dst_stride_rgb565; dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y; src_y += src_stride_y;
if (y & 1) { if (y & 1) {
@ -1029,9 +847,10 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* rgb_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C; int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@ -1069,12 +888,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) #if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; I422ToARGBRow = I422ToARGBRow_DSPR2;
} }
#endif #endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
@ -1105,7 +924,7 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
// Allocate a row of argb. // Allocate a row of argb.
align_buffer_64(row_argb, width * 4); align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, width); I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565, ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width); *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565; dst_rgb565 += dst_stride_rgb565;
@ -1258,7 +1077,6 @@ int ConvertFromI420(const uint8* y, int y_stride,
// Triplanar formats // Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth // TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420: case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: { case FOURCC_YV12: {
int halfwidth = (width + 1) / 2; int halfwidth = (width + 1) / 2;
int halfheight = (height + 1) / 2; int halfheight = (height + 1) / 2;

View File

@ -28,10 +28,10 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV444Row_C; int width) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -109,13 +109,16 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
int pix) = ARGBToUV422Row_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { if (!src_argb ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1; return -1;
} }
// Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
@ -130,34 +133,22 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
height = 1; height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
} }
#if defined(HAS_ARGBTOUV422ROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
} }
} }
@ -170,9 +161,17 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToUV422Row(src_argb, dst_u, dst_v, width); ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb, dst_y, width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_y += dst_stride_y; dst_y += dst_stride_y;
@ -191,8 +190,8 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV411Row_C; int width) = ARGBToUV411Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1; return -1;
@ -264,7 +263,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C; int width) = MergeUVRow_C;
@ -373,7 +372,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
int halfwidth = (width + 1) >> 1; int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C; int width) = MergeUVRow_C;
@ -478,9 +477,9 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2, uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
int pix) = ARGBToUV422Row_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
@ -502,34 +501,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
height = 1; height = 1;
src_stride_argb = dst_stride_yuy2 = 0; src_stride_argb = dst_stride_yuy2 = 0;
} }
#if defined(HAS_ARGBTOUV422ROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
} }
} }
@ -542,7 +529,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_SSE2) #if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@ -567,7 +561,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* row_v = row_u + ((width + 63) & ~63) / 2; uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToUV422Row(src_argb, row_u, row_v, width); ARGBToUVRow(src_argb, 0, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width); ARGBToYRow(src_argb, row_y, width);
I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
@ -585,9 +579,9 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy, uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
int pix) = ARGBToUV422Row_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
@ -609,34 +603,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
height = 1; height = 1;
src_stride_argb = dst_stride_uyvy = 0; src_stride_argb = dst_stride_uyvy = 0;
} }
#if defined(HAS_ARGBTOUV422ROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
} }
} }
@ -649,7 +631,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_SSE2) #if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@ -674,7 +663,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* row_v = row_u + ((width + 63) & ~63) / 2; uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToUV422Row(src_argb, row_u, row_v, width); ARGBToUVRow(src_argb, 0, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width); ARGBToYRow(src_argb, row_y, width);
I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
@ -692,7 +681,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y, uint8* dst_y, int dst_stride_y,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
ARGBToYRow_C; ARGBToYRow_C;
if (!src_argb || !dst_y || width <= 0 || height == 0) { if (!src_argb || !dst_y || width <= 0 || height == 0) {
return -1; return -1;
@ -764,7 +753,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24, uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB24Row_C; ARGBToRGB24Row_C;
if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
return -1; return -1;
@ -812,7 +801,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_raw, int dst_stride_raw, uint8* dst_raw, int dst_stride_raw,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRAWRow_C; ARGBToRAWRow_C;
if (!src_argb || !dst_raw || width <= 0 || height == 0) { if (!src_argb || !dst_raw || width <= 0 || height == 0) {
return -1; return -1;
@ -869,7 +858,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
const uint8* dither4x4, int width, int height) { const uint8* dither4x4, int width, int height) {
int y; int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -921,7 +910,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToRGB565Row_C; ARGBToRGB565Row_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1; return -1;
@ -977,7 +966,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555, uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB1555Row_C; ARGBToARGB1555Row_C;
if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
return -1; return -1;
@ -1033,7 +1022,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444, uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
ARGBToARGB4444Row_C; ARGBToARGB4444Row_C;
if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
return -1; return -1;
@ -1093,7 +1082,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
int y; int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C; ARGBToYJRow_C;
if (!src_argb || if (!src_argb ||
!dst_yj || !dst_u || !dst_v || !dst_yj || !dst_u || !dst_v ||
@ -1157,21 +1146,24 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// ARGB little endian (bgra in memory) to J422 // Convert ARGB to J422. (JPeg full range I422).
LIBYUV_API LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb, int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y, uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u, uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
int pix) = ARGBToUVJ422Row_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C; ARGBToYJRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1; return -1;
} }
// Negative height means invert the image.
if (height < 0) { if (height < 0) {
height = -height; height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
@ -1179,34 +1171,19 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
} }
// Coalesce rows. // Coalesce rows.
if (src_stride_argb == width * 4 && if (src_stride_argb == width * 4 &&
dst_stride_y == width && dst_stride_yj == width &&
dst_stride_u * 2 == width && dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) { dst_stride_v * 2 == width) {
width *= height; width *= height;
height = 1; height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
} }
#if defined(HAS_ARGBTOUVJ422ROW_SSSE3) #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJ422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3;
} }
} }
@ -1227,12 +1204,20 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToUVJ422Row(src_argb, dst_u, dst_v, width); ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYJRow(src_argb, dst_y, width); ARGBToYJRow(src_argb, dst_yj, width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_y += dst_stride_y; dst_yj += dst_stride_yj;
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
@ -1245,7 +1230,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj, uint8* dst_yj, int dst_stride_yj,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
ARGBToYJRow_C; ARGBToYJRow_C;
if (!src_argb || !dst_yj || width <= 0 || height == 0) { if (!src_argb || !dst_yj || width <= 0 || height == 0) {
return -1; return -1;

View File

@ -9,6 +9,7 @@
*/ */
#include "libyuv/convert.h" #include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#ifdef HAVE_JPEG #ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h" #include "libyuv/mjpeg_decoder.h"

View File

@ -23,7 +23,7 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// Convert camera sample to I420 with cropping, rotation and vertical flip. // Convert camera sample to ARGB with cropping, rotation and vertical flip.
// src_width is used for source stride computation // src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion // src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame. // sample_size is measured in bytes and is the size of the frame.
@ -51,8 +51,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// also enable temporary buffer. // also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample; crop_argb == sample;
uint8* tmp_argb = crop_argb; uint8* dest_argb = crop_argb;
int tmp_argb_stride = argb_stride; int dest_argb_stride = argb_stride;
uint8* rotate_buffer = NULL; uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@ -66,13 +66,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
} }
if (need_buf) { if (need_buf) {
int argb_size = crop_width * abs_crop_height * 4; int argb_size = crop_width * 4 * abs_crop_height;
rotate_buffer = (uint8*)malloc(argb_size); rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) { if (!rotate_buffer) {
return 1; // Out of memory runtime error. return 1; // Out of memory runtime error.
} }
crop_argb = rotate_buffer; crop_argb = rotate_buffer;
argb_stride = crop_width; argb_stride = crop_width * 4;
} }
switch (format) { switch (format) {
@ -176,7 +176,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
break; break;
// Triplanar formats // Triplanar formats
case FOURCC_I420: case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: { case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x); const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u; const uint8* src_u;
@ -291,7 +290,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) { if (need_buf) {
if (!r) { if (!r) {
r = ARGBRotate(crop_argb, argb_stride, r = ARGBRotate(crop_argb, argb_stride,
tmp_argb, tmp_argb_stride, dest_argb, dest_argb_stride,
crop_width, abs_crop_height, rotation); crop_width, abs_crop_height, rotation);
} }
free(rotate_buffer); free(rotate_buffer);

View File

@ -39,12 +39,13 @@ int ConvertToI420(const uint8* sample,
int aligned_src_width = (src_width + 1) & ~1; int aligned_src_width = (src_width + 1) & ~1;
const uint8* src; const uint8* src;
const uint8* src_uv; const uint8* src_uv;
int abs_src_height = (src_height < 0) ? -src_height : src_height; const int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; // TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0; int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 && format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample; format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y; uint8* tmp_y = y;
uint8* tmp_u = u; uint8* tmp_u = u;
uint8* tmp_v = v; uint8* tmp_v = v;
@ -52,16 +53,14 @@ int ConvertToI420(const uint8* sample,
int tmp_u_stride = u_stride; int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride; int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL; uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
if (!y || !u || !v || !sample || if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 || src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) { src_height == 0 || crop_height == 0) {
return -1; return -1;
} }
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
// One pass rotation is available for some formats. For the rest, convert // One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer, // to I420 (with optional vertical flipping) into a temporary I420 buffer,
@ -214,7 +213,6 @@ int ConvertToI420(const uint8* sample,
break; break;
// Triplanar formats // Triplanar formats
case FOURCC_I420: case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: { case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x); const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u; const uint8* src_u;

View File

@ -10,12 +10,12 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) #if defined(_MSC_VER)
#include <intrin.h> // For __cpuidex() #include <intrin.h> // For __cpuidex()
#endif #endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219) defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv() #include <immintrin.h> // For _xgetbv()
#endif #endif
@ -36,7 +36,8 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow, // For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check. // use SAFEBUFFERS to avoid additional check.
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
!defined(__clang__)
#define SAFEBUFFERS __declspec(safebuffers) #define SAFEBUFFERS __declspec(safebuffers)
#else #else
#define SAFEBUFFERS #define SAFEBUFFERS
@ -48,9 +49,9 @@ extern "C" {
!defined(__pnacl__) && !defined(__CLR_VER) !defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) #if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly. // Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219) #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx); __cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86) #elif defined(_M_IX86)
__asm { __asm {
@ -63,7 +64,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
mov [edi + 8], ecx mov [edi + 8], ecx
mov [edi + 12], edx mov [edi + 12], edx
} }
#else #else // Visual C but not x86
if (info_ecx == 0) { if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax); __cpuid((int*)(cpu_info), info_eax);
} else { } else {
@ -71,9 +72,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
} }
#endif #endif
// GCC version uses inline x86 assembly. // GCC version uses inline x86 assembly.
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) #else // defined(_MSC_VER)
uint32 info_ebx, info_edx; uint32 info_ebx, info_edx;
asm volatile ( // NOLINT asm volatile (
#if defined( __i386__) && defined(__PIC__) #if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit. // Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n" "mov %%ebx, %%edi \n"
@ -89,7 +90,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx; cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx; cpu_info[2] = info_ecx;
cpu_info[3] = info_edx; cpu_info[3] = info_edx;
#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__) #endif // defined(_MSC_VER)
} }
#else // (defined(_M_IX86) || defined(_M_X64) ... #else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API LIBYUV_API
@ -98,28 +99,37 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
} }
#endif #endif
// TODO(fbarchard): Enable xgetbv when validator supports it. // For VS2010 and earlier emit can be used:
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
// __asm {
// xor ecx, ecx // xcr 0
// xgetbv
// mov xcr0, eax
// }
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
// https://code.google.com/p/libyuv/issues/detail?id=529
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || \ #if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \ defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV #define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() { int GetXCR0() {
uint32 xcr0 = 0u; uint32 xcr0 = 0u;
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__) #elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__) #endif // defined(__i386__) || defined(__x86_64__)
return((xcr0 & 6) == 6); // Is ymm saved? return xcr0;
} }
#endif // defined(_M_IX86) || defined(_M_X64) .. #endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
// based on libvpx arm_cpudetect.c // based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU // For Arm, but public to allow testing on any CPU
@ -151,30 +161,9 @@ int ArmCpuCaps(const char* cpuinfo_name) {
return 0; return 0;
} }
#if defined(__mips__) && defined(__linux__)
static int MipsCpuCaps(const char* search_string) {
char cpuinfo_line[512];
const char* file_name = "/proc/cpuinfo";
FILE* f = fopen(file_name, "r");
if (!f) {
// Assume DSP if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasMIPS_DSP;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
}
}
fclose(f);
return 0;
}
#endif
// CPU detect function for SIMD instruction sets. // CPU detect function for SIMD instruction sets.
LIBYUV_API LIBYUV_API
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. int cpu_info_ = 0; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value // Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off. // to disable. Zero ignored to make it easy to set the variable on/off.
@ -197,8 +186,9 @@ static LIBYUV_BOOL TestEnv(const char*) {
LIBYUV_API SAFEBUFFERS LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) { int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) #if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 }; uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 }; uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 }; uint32 cpu_info7[4] = { 0, 0, 0, 0 };
@ -207,66 +197,66 @@ int InitCpuFlags(void) {
if (cpu_info0[0] >= 7) { if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7); CpuId(7, 0, cpu_info7);
} }
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86; kCpuHasX86;
#ifdef HAS_XGETBV #ifdef HAS_XGETBV
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
TestOsSaveYmm()) { // Saves YMM. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
kCpuHasAVX; cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
} }
#endif #endif
// Environment variable overrides for testing. // Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) { if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86; cpu_info &= ~kCpuHasX86;
} }
if (TestEnv("LIBYUV_DISABLE_SSE2")) { if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2; cpu_info &= ~kCpuHasSSE2;
} }
if (TestEnv("LIBYUV_DISABLE_SSSE3")) { if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3; cpu_info &= ~kCpuHasSSSE3;
} }
if (TestEnv("LIBYUV_DISABLE_SSE41")) { if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41; cpu_info &= ~kCpuHasSSE41;
} }
if (TestEnv("LIBYUV_DISABLE_SSE42")) { if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info_ &= ~kCpuHasSSE42; cpu_info &= ~kCpuHasSSE42;
} }
if (TestEnv("LIBYUV_DISABLE_AVX")) { if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info_ &= ~kCpuHasAVX; cpu_info &= ~kCpuHasAVX;
} }
if (TestEnv("LIBYUV_DISABLE_AVX2")) { if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info_ &= ~kCpuHasAVX2; cpu_info &= ~kCpuHasAVX2;
} }
if (TestEnv("LIBYUV_DISABLE_ERMS")) { if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info_ &= ~kCpuHasERMS; cpu_info &= ~kCpuHasERMS;
} }
if (TestEnv("LIBYUV_DISABLE_FMA3")) { if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3; cpu_info &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
} }
#endif #endif
#if defined(__mips__) && defined(__linux__) #if defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2) #if defined(__mips_dspr2)
cpu_info_ |= kCpuHasMIPS_DSPR2; cpu_info |= kCpuHasDSPR2;
#endif #endif
cpu_info_ |= kCpuHasMIPS; cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
if (getenv("LIBYUV_DISABLE_MIPS")) { cpu_info &= ~kCpuHasDSPR2;
cpu_info_ &= ~kCpuHasMIPS;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
cpu_info_ &= ~kCpuHasMIPS_DSP;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
} }
#endif #endif
#if defined(__arm__) || defined(__aarch64__) #if defined(__arm__) || defined(__aarch64__)
@ -274,28 +264,31 @@ int InitCpuFlags(void) {
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info_ = kCpuHasNEON; cpu_info = kCpuHasNEON;
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it. // flag in it.
// So for aarch64, neon enabling is hard coded here. // So for aarch64, neon enabling is hard coded here.
#endif #endif
#if defined(__aarch64__) #if defined(__aarch64__)
cpu_info_ = kCpuHasNEON; cpu_info = kCpuHasNEON;
#else #else
// Linux arm parse text file for neon detect. // Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); cpu_info = ArmCpuCaps("/proc/cpuinfo");
#endif #endif
cpu_info_ |= kCpuHasARM; cpu_info |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) { if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info_ &= ~kCpuHasNEON; cpu_info &= ~kCpuHasNEON;
} }
#endif // __arm__ #endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) { if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = 0; cpu_info = 0;
} }
return cpu_info_; cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
} }
// Note that use of this function is not thread safe.
LIBYUV_API LIBYUV_API
void MaskCpuFlags(int enable_flags) { void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags; cpu_info_ = InitCpuFlags() & enable_flags;

View File

@ -59,8 +59,7 @@ const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
// Methods that are passed to jpeglib. // Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo); boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo); void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo); void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo); void ErrorHandler(jpeg_common_struct* cinfo);
@ -429,8 +428,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
return TRUE; return TRUE;
} }
void skip_input_data(j_decompress_ptr cinfo, void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes; cinfo->src->next_input_byte += num_bytes;
} }

View File

@ -17,51 +17,22 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// Enable this to try scasb implementation. // Helper function to scan for EOI marker (0xff 0xd9).
// #define ENABLE_SCASB 1
#ifdef ENABLE_SCASB
// Multiple of 1.
__declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // src
mov eax, [esp + 8] // val
mov ecx, [esp + 12] // count
repne scasb
jne sr99
mov eax, edi
sub eax, 1
mov edi, edx
ret
sr99:
mov eax, 0
mov edi, edx
ret
}
}
#endif
// Helper function to scan for EOI marker.
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* end = sample + sample_size - 1; if (sample_size >= 2) {
const uint8* it = sample; const uint8* end = sample + sample_size - 1;
for (;;) { const uint8* it = sample;
#ifdef ENABLE_SCASB while (it < end) {
it = ScanRow_ERMS(it, 0xff, end - it); // TODO(fbarchard): scan for 0xd9 instead.
#else it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); if (it == NULL) {
#endif break;
if (it == NULL) { }
break; if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
} }
if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
} }
// ERROR: Invalid jpeg end code not found. Size sample_size // ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE; return LIBYUV_FALSE;
@ -69,20 +40,19 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
// Helper function to validate the jpeg appears intact. // Helper function to validate the jpeg appears intact.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024; const size_t kBackSearchSize = 1024;
if (sample_size < 64) { if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
// ERROR: Invalid jpeg size: sample_size // ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE; return LIBYUV_FALSE;
} }
if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
// ERROR: Invalid jpeg initial start code // ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE; return LIBYUV_FALSE;
} }
// Step over SOI marker.
sample += 2;
sample_size -= 2;
// Look for the End Of Image (EOI) marker in the end kilobyte of the buffer. // Look for the End Of Image (EOI) marker near the end of the buffer.
if (sample_size > kBackSearchSize) { if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg. return LIBYUV_TRUE; // Success: Valid jpeg.
@ -90,8 +60,8 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Reduce search size for forward search. // Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1; sample_size = sample_size - kBackSearchSize + 1;
} }
return ScanEOI(sample, sample_size); // Step over SOI marker and scan for EOI.
return ScanEOI(sample + 2, sample_size - 2);
} }
#ifdef __cplusplus #ifdef __cplusplus

File diff suppressed because it is too large Load Diff

View File

@ -49,13 +49,13 @@ void TransposePlane(const uint8* src, int src_stride,
} }
} }
#endif #endif
#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasDSPR2)) {
if (IS_ALIGNED(width, 4) && if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; TransposeWx8 = TransposeWx8_Fast_DSPR2;
} else { } else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2; TransposeWx8 = TransposeWx8_DSPR2;
} }
} }
#endif #endif
@ -117,14 +117,6 @@ void RotatePlane180(const uint8* src, int src_stride,
} }
} }
#endif #endif
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MirrorRow = MirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3) #if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3; MirrorRow = MirrorRow_Any_SSSE3;
@ -142,11 +134,11 @@ void RotatePlane180(const uint8* src, int src_stride,
} }
#endif #endif
// TODO(fbarchard): Mirror on mips handle unaligned memory. // TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2) #if defined(HAS_MIRRORROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_MIPS_DSPR2; MirrorRow = MirrorRow_DSPR2;
} }
#endif #endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
@ -204,14 +196,17 @@ void TransposeUV(const uint8* src, int src_stride,
} }
#endif #endif
#if defined(HAS_TRANSPOSEUVWX8_SSE2) #if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasSSE2)) {
TransposeUVWx8 = TransposeUVWx8_SSE2; TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEUVWX8_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; TransposeUVWx8 = TransposeUVWx8_DSPR2;
} }
#endif #endif
@ -272,22 +267,22 @@ void RotateUV180(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width, int height) { int width, int height) {
int i; int i;
void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C; MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON) #if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorRowUV = MirrorUVRow_NEON; MirrorUVRow = MirrorUVRow_NEON;
} }
#endif #endif
#if defined(HAS_MIRRORROW_UV_SSSE3) #if defined(HAS_MIRRORUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRowUV = MirrorUVRow_SSSE3; MirrorUVRow = MirrorUVRow_SSSE3;
} }
#endif #endif
#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) #if defined(HAS_MIRRORUVROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorUVRow_MIPS_DSPR2; MirrorUVRow = MirrorUVRow_DSPR2;
} }
#endif #endif
@ -295,7 +290,7 @@ void RotateUV180(const uint8* src, int src_stride,
dst_b += dst_stride_b * (height - 1); dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) { for (i = 0; i < height; ++i) {
MirrorRowUV(src, dst_a, dst_b, width); MirrorUVRow(src, dst_a, dst_b, width);
src += src_stride; src += src_stride;
dst_a -= dst_stride_a; dst_a -= dst_stride_a;
dst_b -= dst_stride_b; dst_b -= dst_stride_b;

View File

@ -18,7 +18,7 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \ #define TANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \ void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \ uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \ int r = width & MASK; \
@ -26,24 +26,49 @@ extern "C" {
if (n > 0) { \ if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \ } \
TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
} }
#ifdef HAS_TRANSPOSEWX8_NEON #ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
#endif #endif
#ifdef HAS_TRANSPOSEWX8_SSSE3 #ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif #endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif #endif
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2 #ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7) TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#endif #endif
#undef TANY #undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst_a, int dst_stride_a, \
uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, \
dst_a + n * dst_stride_a, dst_stride_a, \
dst_b + n * dst_stride_b, dst_stride_b, r); \
}
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
#undef TUVANY
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -17,16 +17,17 @@ extern "C" {
#endif #endif
// This module is for GCC x86 and x64. // This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \ #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8* src, int src_stride, void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movq (%0),%%xmm0 \n" "movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n" "movq (%0,%3),%%xmm1 \n"
@ -105,386 +106,260 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
); );
} }
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) // Transpose 16x8. 64 bit
void TransposeUVWx8_SSE2(const uint8* src, int src_stride, #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n" "movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n" "lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n" "movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n" "punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n" "movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n" "movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n" "palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n" "palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n" "movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n" "lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n" "movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n" "punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n" "punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n" "movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n" "movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n" "palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n" "palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n" "movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n" "lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n" "movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n" "punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n" "punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n" "movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n" "movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n" "palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n" "palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n" "movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n" "lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n" "movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n" "punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n" "punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n" "neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n" "movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n" "movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n" "lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n" "palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n" "palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n" "neg %3 \n"
// Second round of bit swap. // Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n" "punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n" "movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n" "palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n" "palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n" "punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n" "punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n" "movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n" "movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n" "palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n" "palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n" "punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n" "punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n" "movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n" "movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n" "palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n" "palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n" "punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n" "punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n" "movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n" "movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n" "palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n" "palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap. // Third round of bit swap.
// Write to the destination pointer. // Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n" "punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n" "movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n" "palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n" "movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n" "punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n" "movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n" "movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n" "palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n" "punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n" "movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n" "movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n" "movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n" "palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n" "movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n" "punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n" "movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n" "movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n" "palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n" "movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n" "punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n" "movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n" "movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n" "palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n" "movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n" "punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n" "movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n" "movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n" "palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n" "punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n" "movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n" "movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n" "movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n" "palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n" "movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n" "punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n" "movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n" "movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n" "palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n" "movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n" "lea (%1,%4,2),%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3 : "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4 "r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", : "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
); );
} }
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride, void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) { uint8* dst_b, int dst_stride_b, int width) {
asm volatile ( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
".p2align 2 \n" LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n" "movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n" "lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n" "movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n" "punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n" "movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n" "movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n" "movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n" "lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n" "movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n" "punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n" "punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n" "movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n" "movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n" "movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n" "lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n" "movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n" "punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n" "punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n" "movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n" "movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n" "lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n" "movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n" "punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n" "neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n" "lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n" "punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n" "movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n" "neg %4 \n"
// Second round of bit swap. // Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n" "movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n" "movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n" "punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n" "punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n" "punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n" "movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n" "movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n" "movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n" "movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n" "punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n" "punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n" "punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n" "punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n" "movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n" "movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap. // Third round of bit swap.
// Write to the destination pointer. // Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n" "movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n" "punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel "movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel "movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n" "punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n" "movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n" "movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n" "punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n" "movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n" "movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n" "punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n" "movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n" "movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n" "punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n" "movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n" "movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n" "punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n" "movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n" "movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n" "punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n" "movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n" "movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n" "punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n" "sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n" "movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n" "lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n" "movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n" "lea (%2,%6,2),%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst_a), // %1 "+r"(dst_a), // %1
"+r"(dst_b), // %2 "+r"(dst_b), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4 : "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5 "r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6 "r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc", : "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9" "xmm8", "xmm9"
); );
} }
#endif #endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif
#endif // defined(__x86_64__) || defined(__i386__) #endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -22,8 +22,8 @@ extern "C" {
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -106,8 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
); );
} }
void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set noat \n" ".set noat \n"
".set push \n" ".set push \n"
@ -308,10 +308,10 @@ void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
); );
} }
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width) { int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"

View File

@ -27,7 +27,7 @@ static uvec8 kVTbl4x4Transpose =
void TransposeWx8_NEON(const uint8* src, int src_stride, void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int width) { int width) {
const uint8* src_temp = NULL; const uint8* src_temp;
asm volatile ( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %5, #8 \n" "sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
@ -230,7 +229,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"4: \n" "4: \n"
: "+r"(src_temp), // %0 : "=&r"(src_temp), // %0
"+r"(src), // %1 "+r"(src), // %1
"+r"(src_stride), // %2 "+r"(src_stride), // %2
"+r"(dst), // %3 "+r"(dst), // %3
@ -248,7 +247,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width) { int width) {
const uint8* src_temp = NULL; const uint8* src_temp;
asm volatile ( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %7, #8 \n" "sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane // handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
@ -514,7 +512,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"4: \n" "4: \n"
: "+r"(src_temp), // %0 : "=&r"(src_temp), // %0
"+r"(src), // %1 "+r"(src), // %1
"+r"(src_stride), // %2 "+r"(src_stride), // %2
"+r"(dst_a), // %3 "+r"(dst_a), // %3

View File

@ -26,7 +26,7 @@ static uvec8 kVTbl4x4Transpose =
void TransposeWx8_NEON(const uint8* src, int src_stride, void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) { uint8* dst, int dst_stride, int width) {
const uint8* src_temp = NULL; const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning. int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile ( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
@ -235,7 +235,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"4: \n" "4: \n"
: "+r"(src_temp), // %0 : "=&r"(src_temp), // %0
"+r"(src), // %1 "+r"(src), // %1
"+r"(dst), // %2 "+r"(dst), // %2
"+r"(width64) // %3 "+r"(width64) // %3
@ -255,7 +255,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width) { int width) {
const uint8* src_temp = NULL; const uint8* src_temp;
int64 width64 = (int64) width; // Work around clang 3.4 warning. int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile ( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
@ -520,7 +520,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"4: \n" "4: \n"
: "+r"(src_temp), // %0 : "=&r"(src_temp), // %0
"+r"(src), // %1 "+r"(src), // %1
"+r"(dst_a), // %2 "+r"(dst_a), // %2
"+r"(dst_b), // %3 "+r"(dst_b), // %3

View File

@ -16,9 +16,8 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// This module is for Visual C x86. // This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride, void TransposeWx8_SSSE3(const uint8* src, int src_stride,

View File

@ -22,6 +22,39 @@ extern "C" {
// Subsampled source needs to be increase by 1 of not even. // Subsampled source needs to be increase by 1 of not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
const uint8* a_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_AVX2
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
#undef ANY41C
// Any 3 planes to 1. // Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ #define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
@ -40,75 +73,9 @@ extern "C" {
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \ SS(r, DUVSHIFT) * BPP); \
} }
#ifdef HAS_I422TOYUY2ROW_SSE2
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TORAWROW_AVX2
ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_J422TOARGBROW_SSSE3
ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_J422TOARGBROW_AVX2
ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOBGRAROW_AVX2
ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOABGRROW_AVX2
ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif #endif
#ifdef HAS_I422TOYUY2ROW_NEON #ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
@ -116,8 +83,91 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOUYVYROW_NEON #ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif #endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
#undef ANY31 #undef ANY31
// Note that odd width replication includes 444 due to implementation
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I411TOARGBROW_SSSE3
ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#undef ANY31C
// Any 2 planes to 1. // Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
@ -136,32 +186,6 @@ ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
} }
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
// Merge functions. // Merge functions.
#ifdef HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_SSE2
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
@ -221,6 +245,55 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#endif #endif
#undef ANY21 #undef ANY21
// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_AVX2
ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV21TOARGBROW_NEON
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
#undef ANY21C
// Any 1 to 1. // Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
@ -252,8 +325,10 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif #endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2) #if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif #endif
@ -269,15 +344,16 @@ ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_I400TOARGBROW_AVX2) #if defined(HAS_I400TOARGBROW_AVX2)
ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif #endif
#if defined(HAS_YUY2TOARGBROW_SSSE3) #if defined(HAS_RGB24TOARGBROW_SSSE3)
ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif #endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
#if defined(HAS_RGB565TOARGBROW_AVX2) #if defined(HAS_RGB565TOARGBROW_AVX2)
ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
#endif #endif
@ -287,10 +363,6 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
#if defined(HAS_ARGB4444TOARGBROW_AVX2) #if defined(HAS_ARGB4444TOARGBROW_AVX2)
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
#endif #endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON) #if defined(HAS_ARGBTORGB24ROW_NEON)
ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
@ -299,8 +371,9 @@ ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) #endif
ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif #endif
#ifdef HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
@ -381,9 +454,6 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif #endif
#ifdef HAS_ARGBATTENUATEROW_SSE2
ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
#endif #endif
@ -396,8 +466,44 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_NEON #ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif #endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#endif
#undef ANY11 #undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#endif
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#undef ANY11B
// Any 1 to 1 with parameter. // Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
@ -440,6 +546,35 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif #endif
#undef ANY11P #undef ANY11P
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
#endif
#if defined(HAS_YUY2TOARGBROW_NEON)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride. // Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
@ -464,14 +599,11 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
#ifdef HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSSE3
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
#endif #endif
#ifdef HAS_INTERPOLATEROW_SSE2
ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON #ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif #endif
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 #ifdef HAS_INTERPOLATEROW_DSPR2
ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3) ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#endif #endif
#undef ANY11T #undef ANY11T
@ -496,9 +628,6 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
#ifdef HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSSE3
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif #endif
#ifdef HAS_MIRRORROW_SSE2
ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON #ifdef HAS_MIRRORROW_NEON
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
#endif #endif
@ -548,9 +677,25 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
ANY_SIMD(src_ptr, dst_u, dst_v, n); \ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \ } \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ /* repeat last 4 bytes for 422 subsampler */ \
if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \ memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
/* repeat last 4 - 12 bytes for 411 subsampler */ \
if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
} \
if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
} \
if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \ } \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
@ -566,8 +711,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
#ifdef HAS_SPLITUVROW_NEON #ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif #endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2 #ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15) ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#endif #endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@ -576,16 +721,12 @@ ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
#endif #endif
#ifdef HAS_ARGBTOUV422ROW_SSSE3
ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2 #ifdef HAS_YUY2TOUV422ROW_SSE2
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif #endif
#ifdef HAS_YUY2TOUV422ROW_NEON #ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
@ -607,11 +748,11 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \ SS(r, UVSHIFT) * BPP); \
if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
memcpy(temp + SS(r, UVSHIFT) * BPP, \ memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \ } \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
@ -621,6 +762,9 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#ifdef HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif #endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_SSSE3
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -375,13 +375,13 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
} }
#endif // HAS_COPYROW_MIPS #endif // HAS_COPYROW_MIPS
// MIPS DSPR2 functions // DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \ (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n" "blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual " andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n" "1: \n"
"addiu $t4, $t4, -1 \n" "addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@ -447,7 +446,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
); );
} }
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n" "blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width " addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4| "lw $t1, -12(%[src]) \n" // |7|6|5|4|
@ -498,10 +496,10 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
); );
} }
void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
int x = 0; int x;
int y = 0; int y;
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n" "blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n" " addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
@ -582,7 +579,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
[dst_u] "+r" (dst_u), [dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v), [dst_v] "+r" (dst_v),
[x] "=&r" (x), [x] "=&r" (x),
[y] "+r" (y) [y] "=&r" (y)
: [width] "r" (width) : [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", : "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9" "t5", "t7", "t8", "t9"
@ -596,7 +593,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t8 = | 0 | G1 | 0 | g1 | // t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 | // t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 | // t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \ #define YUVTORGB \
"lw $t0, 0(%[y_buf]) \n" \ "lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \ "lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \ "lhu $t2, 0(%[v_buf]) \n" \
@ -655,11 +652,13 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"addu.ph $t2, $t2, $s5 \n" \ "addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n" "addu.ph $t1, $t1, $s5 \n"
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, // TODO(fbarchard): accept yuv conversion constants.
const uint8* u_buf, void I422ToARGBRow_DSPR2(const uint8* y_buf,
const uint8* v_buf, const uint8* u_buf,
uint8* rgb_buf, const uint8* v_buf,
int width) { uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -673,9 +672,8 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n" "lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n" "1: \n"
I422ToTransientMipsRGB YUVTORGB
// Arranging into argb format // Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
@ -717,136 +715,10 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
); );
} }
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1 // Bilinear filter 8x2 -> 8x1
void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width, ptrdiff_t src_stride, int dst_width,
int source_y_fraction) { int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction; int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr1 = src_ptr + src_stride;
@ -857,7 +729,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n" "replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t2, 0(%[src_ptr]) \n" "lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n" "lw $t3, 0(%[src_ptr1]) \n"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,146 +0,0 @@
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"
SECTION .text
; cglobal numeric constants are parameters, gpr regs, mm regs
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
psrlw m2, m2, 8
%endif
ALIGN 4
.convertloop:
mov%2 m0, [src_yuy2q]
mov%2 m1, [src_yuy2q + mmsize]
lea src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
pand m0, m0, m2 ; YUY2 even bytes are Y
pand m1, m1, m2
%else
psrlw m0, m0, 8 ; UYVY odd bytes are Y
psrlw m1, m1, 8
%endif
packuswb m0, m0, m1
%if cpuflag(AVX2)
vpermq m0, m0, 0xd8
%endif
sub pixd, mmsize
mov%2 [dst_yq], m0
lea dst_yq, [dst_yq + mmsize]
jg .convertloop
REP_RET
%endmacro
; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,
; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
psrlw m4, m4, 8
sub dst_vq, dst_uq
ALIGN 4
.convertloop:
mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize]
lea src_uvq, [src_uvq + mmsize * 2]
psrlw m2, m0, 8 ; odd bytes
psrlw m3, m1, 8
pand m0, m0, m4 ; even bytes
pand m1, m1, m4
packuswb m0, m0, m1
packuswb m2, m2, m3
%if cpuflag(AVX2)
vpermq m0, m0, 0xd8
vpermq m2, m2, 0xd8
%endif
mov%1 [dst_uq], m0
mov%1 [dst_uq + dst_vq], m2
lea dst_uq, [dst_uq + mmsize]
sub pixd, mmsize
jg .convertloop
REP_RET
%endmacro
INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,
; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
; int width);
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
sub src_vq, src_uq
ALIGN 4
.convertloop:
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
lea src_uq, [src_uq + mmsize]
punpcklbw m2, m0, m1 // first 8 UV pairs
punpckhbw m0, m0, m1 // next 8 UV pairs
%if cpuflag(AVX2)
vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
mov%1 [dst_uvq], m1
mov%1 [dst_uvq + mmsize], m2
%else
mov%1 [dst_uvq], m2
mov%1 [dst_uvq + mmsize], m0
%endif
lea dst_uvq, [dst_uvq + mmsize * 2]
sub pixd, mmsize
jg .convertloop
REP_RET
%endmacro
INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,

View File

@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_SSE2) #if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
ScaleRowDown2Box_Any_SSE2); ScaleRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 16)) { if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
ScaleRowDown2Box_SSE2); ScaleRowDown2Box_SSSE3);
} }
} }
#endif #endif
@ -85,12 +85,12 @@ static void ScalePlaneDown2(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN2_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ? ScaleRowDown2 = filtering ?
ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
} }
#endif #endif
@ -135,12 +135,12 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
ScaleRowDown2Box_16_SSE2); ScaleRowDown2Box_16_SSE2);
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN2_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ? ScaleRowDown2 = filtering ?
ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
} }
#endif #endif
@ -182,12 +182,12 @@ static void ScalePlaneDown4(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_SSE2) #if defined(HAS_SCALEROWDOWN4_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown4 = filtering ? ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
if (IS_ALIGNED(dst_width, 8)) { if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
} }
} }
#endif #endif
@ -200,12 +200,12 @@ static void ScalePlaneDown4(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN4_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ? ScaleRowDown4 = filtering ?
ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
} }
#endif #endif
@ -245,12 +245,12 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4_16_SSE2; ScaleRowDown4_16_SSE2;
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN4_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ? ScaleRowDown4 = filtering ?
ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
} }
#endif #endif
@ -325,16 +325,16 @@ static void ScalePlaneDown34(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN34_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
} }
} }
#endif #endif
@ -404,16 +404,16 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN34_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
} }
} }
#endif #endif
@ -517,16 +517,16 @@ static void ScalePlaneDown38(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN38_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
} else { } else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
} }
} }
#endif #endif
@ -595,16 +595,16 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN38_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
} else { } else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
} }
} }
#endif #endif
@ -659,7 +659,6 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = dx >> 16; int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@ -667,7 +666,8 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = MIN1((x >> 16) - ix); boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
scaletbl[boxwidth - minboxwidth] >> 16;
} }
} }
@ -676,7 +676,6 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = dx >> 16; int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@ -684,8 +683,8 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = MIN1((x >> 16) - ix); boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; scaletbl[boxwidth - minboxwidth] >> 16;
} }
} }
@ -875,14 +874,6 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -907,11 +898,11 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(src_width, 4)) { if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
} }
#endif #endif
@ -1011,11 +1002,11 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(src_width, 4)) { if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_16_DSPR2;
} }
} }
#endif #endif
@ -1072,14 +1063,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -1104,11 +1087,11 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(dst_width, 4)) { if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
} }
#endif #endif
@ -1243,11 +1226,11 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width, 4)) { if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_16_DSPR2;
} }
} }
#endif #endif

View File

@ -55,12 +55,29 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
dst_ptr + n * BPP, r); \ dst_ptr + n * BPP, r); \
} }
#ifdef HAS_SCALEROWDOWN2_SSE2 // Fixed scale down for odd source width. Used by I420Blend subsampling.
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) // Since dst_width is (width + 1) / 2, this function scales one less pixel
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, // and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C, 2, 1, 15) ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
2, 1, 15) 2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif #endif
#ifdef HAS_SCALEROWDOWN2_AVX2 #ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
@ -68,6 +85,8 @@ SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31) ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31) 2, 1, 31)
SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
2, 1, 31)
#endif #endif
#ifdef HAS_SCALEROWDOWN2_NEON #ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
@ -75,10 +94,12 @@ SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15) ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15) ScaleRowDown2Box_C, 2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif #endif
#ifdef HAS_SCALEROWDOWN4_SSE2 #ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
4, 1, 7) 4, 1, 7)
#endif #endif
#ifdef HAS_SCALEROWDOWN4_AVX2 #ifdef HAS_SCALEROWDOWN4_AVX2

View File

@ -210,14 +210,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
src_argb += xl * 4; src_argb += xl * 4;
x -= (int)(xl << 16); x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -242,12 +234,12 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) { if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
} }
#endif #endif
@ -308,14 +300,6 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int x, int dx) = int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16; const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -340,10 +324,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
#endif #endif
if (src_width >= 32768) { if (src_width >= 32768) {
@ -481,27 +465,19 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) #if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; I422ToARGBRow = I422ToARGBRow_DSPR2;
} }
#endif #endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C; InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -526,10 +502,10 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
#endif #endif
@ -847,6 +823,36 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -103,6 +103,28 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
dst_width -= 1;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst += 1;
s += 2;
t += 2;
}
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) { uint16* dst, int dst_width) {
const uint16* s = src_ptr; const uint16* s = src_ptr;
@ -395,8 +417,14 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
} }
// (1-f)a + fb can be replaced with a + f(b-a) // (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) (uint8)((int)(a) + \ #define BLENDER(a, b, f) (uint8)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16)) ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// inteluses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
@ -448,8 +476,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
} }
#undef BLENDER #undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \ #define BLENDER(a, b, f) (uint16)((int)(a) + \
((int)(f) * ((int)(b) - (int)(a)) >> 16)) ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
@ -787,6 +816,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
} }
} }
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender // Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \ #define BLENDERC(a, b, f, s) (uint32)( \
@ -876,14 +906,6 @@ void ScalePlaneVertical(int src_height,
assert(dst_width > 0); assert(dst_width > 0);
assert(dst_height > 0); assert(dst_height > 0);
src_argb += (x >> 16) * bpp; src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3) #if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3; InterpolateRow = InterpolateRow_Any_SSSE3;
@ -908,13 +930,13 @@ void ScalePlaneVertical(int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) { if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2; InterpolateRow = InterpolateRow_DSPR2;
} }
} }
#endif #endif
@ -982,13 +1004,13 @@ void ScalePlaneVertical_16(int src_height,
} }
} }
#endif #endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) #if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && if (TestCpuFlag(kCpuHasDSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_Any_16_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) { if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2; InterpolateRow = InterpolateRow_16_DSPR2;
} }
} }
#endif #endif

View File

@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
@ -16,7 +17,8 @@ extern "C" {
#endif #endif
// This module is for GCC x86 and x64. // This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Offsets for source bytes 0 to 9 // Offsets for source bytes 0 to 9
static uvec8 kShuf0 = static uvec8 kShuf0 =
@ -96,8 +98,8 @@ static uvec16 kScaleAb2 =
// Generated using gcc disassembly on Visual C object file: // Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt // objdump -D yuvscaler.obj >yuvscaler.txt
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -118,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0x8,%%xmm5 \n" "psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"movdqa %%xmm1,%%xmm3 \n" "pavgw %%xmm5,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "pavgw %%xmm5,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n" "packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
@ -145,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5" :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
); );
} }
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0x8,%%xmm5 \n" "psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -162,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"movdqa %%xmm1,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "paddw %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n" "psrlw $0x1,%%xmm0 \n"
"pand %%xmm5,%%xmm3 \n" "psrlw $0x1,%%xmm1 \n"
"pavgw %%xmm2,%%xmm0 \n" "pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n" "pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
@ -186,7 +188,105 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1"
);
}
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
@ -214,12 +314,15 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
intptr_t stridex3 = 0; intptr_t stridex3;
asm volatile ( asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0x8,%%xmm7 \n" "psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
"psllw $0x3,%%xmm5 \n"
"lea " MEMLEA4(0x00,4,4,2) ",%3 \n" "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
LABELALIGN LABELALIGN
@ -228,31 +331,29 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pavgb %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 "pmaddubsw %%xmm4,%%xmm2 \n"
MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 "pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm3 \n"
"pavgb %%xmm5,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n" "paddw %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n" "phaddw %%xmm1,%%xmm0 \n"
"psrlw $0x8,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n" "psrlw $0x4,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "packuswb %%xmm0,%%xmm0 \n"
"pand %%xmm7,%%xmm2 \n"
"pand %%xmm7,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm7,%%xmm2 \n"
"pavgw %%xmm2,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n" "movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
@ -260,13 +361,100 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
"+r"(stridex3) // %3 "=&r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4 : "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
); );
} }
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
"vpsrlw $0x4,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(src_stride * 3)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
@ -574,61 +762,89 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
} }
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
asm volatile ( asm volatile (
"mov %0,%3 \n" // row pointer "pxor %%xmm5,%%xmm5 \n"
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu " MEMACCESS(3) ",%%xmm2 \n" "movdqu " MEMACCESS(0) ",%%xmm3 \n"
"add %6,%3 \n" "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"movdqa %%xmm2,%%xmm3 \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"punpcklbw %%xmm4,%%xmm2 \n" "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"punpckhbw %%xmm4,%%xmm3 \n" "movdqa %%xmm3,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpckhbw %%xmm5,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n" "paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n"
"jg 1b \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 "sub $0x10,%2 \n"
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(tmp_height), // %2 "+r"(src_width) // %2
"+r"(tmp_src), // %3 :
"+r"(src_width), // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
); );
} }
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
"vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
"vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version. // Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0, temp_pixel = 0; intptr_t x0, x1, temp_pixel;
asm volatile ( asm volatile (
"movd %6,%%xmm2 \n" "movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n" "movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n" "movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n" "movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n" "pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001
"pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n" "subl $0x2,%5 \n"
"jl 29f \n" "jl 29f \n"
@ -650,16 +866,19 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n" "movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n" "punpcklwd %%xmm4,%%xmm0 \n"
"pxor %%xmm6,%%xmm1 \n" "psubb %8,%%xmm0 \n" // make pixels signed.
"pmaddubsw %%xmm1,%%xmm0 \n" "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n" "pextrw $0x3,%%xmm2,%k4 \n"
"psrlw $0x7,%%xmm0 \n" "paddw %9,%%xmm1 \n" // make pixels unsigned.
"packuswb %%xmm0,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n"
"movd %%xmm0,%k2 \n" "packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"mov %w2," MEMACCESS(0) " \n" "mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n" "lea " MEMLEA(0x2,0) ",%0 \n"
"sub $0x2,%5 \n" "subl $0x2,%5 \n"
"jge 2b \n" "jge 2b \n"
LABELALIGN LABELALIGN
@ -670,23 +889,37 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n" "movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n" "psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n" "pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n" "pxor %%xmm6,%%xmm2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n" "paddusb %%xmm7,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n" "pmaddubsw %%xmm0,%%xmm2 \n"
"packuswb %%xmm0,%%xmm0 \n" "paddw %9,%%xmm2 \n" // make pixels unsigned.
"movd %%xmm0,%k2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"mov %b2," MEMACCESS(0) " \n" "mov %b2," MEMACCESS(0) " \n"
"99: \n" "99: \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+a"(temp_pixel), // %2 "=&a"(temp_pixel), // %2
"+r"(x0), // %3 "=&r"(x0), // %3
"+r"(x1), // %4 "=&r"(x1), // %4
"+rm"(dst_width) // %5 #if defined(__x86_64__)
: "rm"(x), // %6 "+rm"(dst_width) // %5
"rm"(dx) // %7 #else
"+m"(dst_width) // %5
#endif
: "rm"(x), // %6
"rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
); );
} }
@ -795,7 +1028,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) { int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0; intptr_t src_stepx_x12;
asm volatile ( asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n" "lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n" "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@ -813,11 +1046,11 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1 "+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(dst_width), // %3 "+r"(dst_width), // %3
"+r"(src_stepx_x12) // %4 "=&r"(src_stepx_x12) // %4
:: "memory", "cc", NACL_R14 :: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3" "xmm0", "xmm1", "xmm2", "xmm3"
); );
@ -829,7 +1062,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx, ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0; intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride); intptr_t row1 = (intptr_t)(src_stride);
asm volatile ( asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n" "lea " MEMLEA3(0x00,1,4) ",%1 \n"
@ -858,12 +1091,12 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n" "sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1 "+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+rm"(dst_width), // %3 "+rm"(dst_width), // %3
"+r"(src_stepx_x12), // %4 "=&r"(src_stepx_x12), // %4
"+r"(row1) // %5 "+r"(row1) // %5
:: "memory", "cc", NACL_R14 :: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3" "xmm0", "xmm1", "xmm2", "xmm3"
); );
@ -871,7 +1104,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0; intptr_t x0, x1;
asm volatile ( asm volatile (
"movd %5,%%xmm2 \n" "movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n" "movd %6,%%xmm3 \n"
@ -924,8 +1157,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"99: \n" "99: \n"
: "+a"(x0), // %0 : "=&a"(x0), // %0
"+d"(x1), // %1 "=&d"(x1), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(src_argb), // %3 "+r"(src_argb), // %3
"+r"(dst_width) // %4 "+r"(dst_width) // %4
@ -976,7 +1209,7 @@ static uvec8 kShuffleFractions = {
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
intptr_t x0 = 0, x1 = 0; intptr_t x0, x1;
asm volatile ( asm volatile (
"movdqa %0,%%xmm4 \n" "movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n" "movdqa %1,%%xmm5 \n"
@ -1039,8 +1272,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+rm"(dst_width), // %2 "+rm"(dst_width), // %2
"+r"(x0), // %3 "=&r"(x0), // %3
"+r"(x1) // %4 "=&r"(x1) // %4
: "rm"(x), // %5 : "rm"(x), // %5
"rm"(dx) // %6 "rm"(dx) // %6
: "memory", "cc", NACL_R14 : "memory", "cc", NACL_R14

View File

@ -21,8 +21,8 @@ extern "C" {
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) (_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
__asm__ __volatile__( __asm__ __volatile__(
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n" "beqz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -78,8 +77,8 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride; const uint8* t = src_ptr + src_stride;
__asm__ __volatile__ ( __asm__ __volatile__ (
@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n" "bltz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -178,8 +176,8 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n" "beqz $t9, 2f \n"
" nop \n" " nop \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -234,8 +231,8 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride; const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride; const uint8* s2 = s1 + stride;
@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n" "srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4| "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@ -314,12 +310,11 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -361,14 +356,13 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) { uint8* d, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003 "repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -418,14 +412,13 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) { uint8* d, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003 "repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@ -471,13 +464,12 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@ -518,8 +510,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
const uint8* t = src_ptr + stride; const uint8* t = src_ptr + stride;
const int c = 0x2AAA; const int c = 0x2AAA;
@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@ -572,9 +563,9 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride; intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride; const uint8* s1 = src_ptr + stride;
stride += stride; stride += stride;
@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
".p2align 2 \n"
"1: \n" "1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|

View File

@ -26,7 +26,6 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0) MEMACCESS(0)
@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3; const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
asm volatile ( asm volatile (
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(7) MEMACCESS(7)
"vld1.8 {q15}, [%7] \n" "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(5) MEMACCESS(5)
"vld1.8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
// d0 = 00 40 01 41 02 42 03 43 // d0 = 00 40 01 41 02 42 03 43
@ -543,9 +532,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL; const uint8* src_tmp;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
"mov r12, %5 \n" "mov r12, %5 \n"
@ -564,12 +552,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, #16 \n" "add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop "subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_tmp), // %0 : "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(dst_ptr), // %2 "+r"(dst_ptr), // %2
"+r"(src_stride), // %3 "+r"(src_stride), // %3
"+r"(src_width), // %4 "+r"(src_width), // %4
"+r"(src_height) // %5 "+r"(src_height) // %5
: :
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
); );
@ -584,13 +572,16 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
MEMACCESS(6) \ MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3}; int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_ptr; const uint8* src_tmp = src_ptr;
asm volatile ( asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x "vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx "vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vld1.32 {q2}, [%5] \n" // 0 1 2 3
@ -621,8 +612,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"vmovl.u16 q10, d21 \n" "vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n" "vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n" "vmul.s32 q12, q12, q10 \n"
"vshrn.s32 d18, q11, #16 \n" "vrshrn.s32 d18, q11, #16 \n"
"vshrn.s32 d19, q12, #16 \n" "vrshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n" "vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n" "vmovn.s16 d6, q8 \n"
@ -749,7 +740,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
MEMACCESS(0) MEMACCESS(0)
@ -773,7 +763,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -804,7 +793,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -845,7 +833,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) { int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile ( asm volatile (
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[0]}, [%0], r12 \n"
@ -875,7 +862,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
asm volatile ( asm volatile (
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
@ -927,10 +913,9 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
int tmp = 0; int tmp;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile ( asm volatile (
".p2align 2 \n"
"1: \n" "1: \n"
LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d0, 1)
@ -945,13 +930,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
"vst1.32 {q0, q1}, [%0]! \n" // store pixels "vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
"+r"(x), // %3 "+r"(x), // %3
"+r"(dx), // %4 "+r"(dx), // %4
"+r"(tmp), // %5 "=&r"(tmp), // %5
"+r"(src_tmp) // %6 "+r"(src_tmp) // %6
: :
: "memory", "cc", "q0", "q1" : "memory", "cc", "q0", "q1"
); );
@ -974,7 +959,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int* tmp = dx_offset; int* tmp = dx_offset;
const uint8* src_tmp = src_argb; const uint8* src_tmp = src_argb;
asm volatile ( asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x "vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx "vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vld1.32 {q2}, [%5] \n" // 0 1 2 3

View File

@ -547,7 +547,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL; const uint8* src_tmp;
asm volatile ( asm volatile (
"1: \n" "1: \n"
"mov %0, %1 \n" "mov %0, %1 \n"
@ -567,12 +567,12 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, #16 \n" "add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop "subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_tmp), // %0 : "=&r"(src_tmp), // %0
"+r"(src_ptr), // %1 "+r"(src_ptr), // %1
"+r"(dst_ptr), // %2 "+r"(dst_ptr), // %2
"+r"(src_stride), // %3 "+r"(src_stride), // %3
"+r"(src_width), // %4 "+r"(src_width), // %4
"+r"(src_height) // %5 "+r"(src_height) // %5
: :
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
); );
@ -626,8 +626,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"ushll2 v6.4s, v6.8h, #0 \n" "ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n" "mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n" "mul v17.4s, v17.4s, v6.4s \n"
"shrn v6.4h, v16.4s, #16 \n" "rshrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n" "rshrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n" "add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n" "xtn v4.8b, v4.8h \n"
@ -931,7 +931,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x; int64 x64 = (int64) x;
int64 dx64 = (int64) dx; int64 dx64 = (int64) dx;
int64 tmp64 = 0; int64 tmp64;
asm volatile ( asm volatile (
"1: \n" "1: \n"
LOAD1_DATA32_LANE(v0, 0) LOAD1_DATA32_LANE(v0, 0)
@ -947,13 +947,13 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width64), // %2 "+r"(dst_width64), // %2
"+r"(x64), // %3 "+r"(x64), // %3
"+r"(dx64), // %4 "+r"(dx64), // %4
"+r"(tmp64), // %5 "=&r"(tmp64), // %5
"+r"(src_tmp) // %6 "+r"(src_tmp) // %6
: :
: "memory", "cc", "v0", "v1" : "memory", "cc", "v0", "v1"
); );

View File

@ -16,9 +16,8 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// This module is for Visual C x86. // This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9 // Offsets for source bytes 0 to 9
static uvec8 kShuf0 = static uvec8 kShuf0 =
@ -96,8 +95,8 @@ static uvec16 kScaleAb2 =
// Reads 32 pixels, throws half away and writes 16 pixels. // Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) __declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
mov eax, [esp + 4] // src_ptr mov eax, [esp + 4] // src_ptr
// src_stride ignored // src_stride ignored
@ -122,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x1 rectangle to 16x1. // Blends 32x1 rectangle to 16x1.
__declspec(naked) __declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
mov eax, [esp + 4] // src_ptr mov eax, [esp + 4] // src_ptr
// src_stride // src_stride
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pmaddubsw xmm0, xmm4 // horizontal add
movdqa xmm2, xmm0 // average columns (32 to 16 pixels) pmaddubsw xmm1, xmm4
psrlw xmm0, 8 pavgw xmm0, xmm5 // (x + 1) / 2
movdqa xmm3, xmm1 pavgw xmm1, xmm5
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
@ -158,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x2 rectangle to 16x1. // Blends 32x2 rectangle to 16x1.
__declspec(naked) __declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
push esi push esi
mov eax, [esp + 4 + 4] // src_ptr mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8 pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
@ -175,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
movdqu xmm2, [eax + esi] movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16] movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows pmaddubsw xmm0, xmm4 // horizontal add
pavgb xmm1, xmm3 pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
movdqa xmm2, xmm0 // average columns (32 to 16 pixels) pmaddubsw xmm3, xmm4
psrlw xmm0, 8 paddw xmm0, xmm2 // vertical add
movdqa xmm3, xmm1 paddw xmm1, xmm3
psrlw xmm1, 8 psrlw xmm0, 1
pand xmm2, xmm5 psrlw xmm1, 1
pand xmm3, xmm5 pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm0, xmm2 pavgw xmm1, xmm5
pavgw xmm1, xmm3
packuswb xmm0, xmm1 packuswb xmm0, xmm1
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
@ -246,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64] lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5 vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32 sub ecx, 32
@ -264,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1. // Blends 64x2 rectangle to 32x1.
__declspec(naked) __declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -281,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpxor ymm5, ymm5, ymm5 // constant 0 vpxor ymm5, ymm5, ymm5 // constant 0
wloop: wloop:
vmovdqu ymm0, [eax] // average rows vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi] vmovdqu ymm2, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32] vmovdqu ymm3, [eax + esi + 32]
lea eax, [eax + 64] lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add
vpaddw ymm1, ymm1, ymm3
vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
vpsrlw ymm1, ymm1, 1
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5 vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32 sub ecx, 32
@ -308,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// Point samples 32 pixels to 8 pixels. // Point samples 32 pixels to 8 pixels.
__declspec(naked) __declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
mov eax, [esp + 4] // src_ptr mov eax, [esp + 4] // src_ptr
@ -339,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x4 rectangle to 8x1. // Blends 32x4 rectangle to 8x1.
__declspec(naked) __declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
push esi push esi
@ -349,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3 lea edi, [esi + esi * 2] // src_stride * 3
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm7, 8 psrlw xmm4, 15
movdqa xmm5, xmm4
packuswb xmm4, xmm4
psllw xmm5, 3 // constant 0x0008
wloop: wloop:
movdqu xmm0, [eax] // average rows movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi] movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16] movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 pmaddubsw xmm0, xmm4 // horizontal add
pavgb xmm1, xmm3 pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // vertical add rows 0, 1
paddw xmm1, xmm3
movdqu xmm2, [eax + esi * 2] movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16] movdqu xmm3, [eax + esi * 2 + 16]
movdqu xmm4, [eax + edi] pmaddubsw xmm2, xmm4
movdqu xmm5, [eax + edi + 16] pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // add row 2
paddw xmm1, xmm3
movdqu xmm2, [eax + edi]
movdqu xmm3, [eax + edi + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
pavgb xmm2, xmm4 pmaddubsw xmm2, xmm4
pavgb xmm3, xmm5 pmaddubsw xmm3, xmm4
pavgb xmm0, xmm2 paddw xmm0, xmm2 // add row 3
pavgb xmm1, xmm3 paddw xmm1, xmm3
phaddw xmm0, xmm1
movdqa xmm2, xmm0 // average columns (32 to 16 pixels) paddw xmm0, xmm5 // + 8 for round
psrlw xmm0, 8 psrlw xmm0, 4 // /16 for average of 4 * 4
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm7
pand xmm3, xmm7
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
psrlw xmm0, 8
pand xmm2, xmm7
pavgw xmm0, xmm2
packuswb xmm0, xmm0 packuswb xmm0, xmm0
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8 sub ecx, 8
@ -443,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3 lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
vpsrlw ymm7, ymm7, 8 vpsrlw ymm4, ymm4, 15
vpsllw ymm5, ymm4, 3 // constant 0x0008
vpackuswb ymm4, ymm4, ymm4
wloop: wloop:
vmovdqu ymm0, [eax] // average rows vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32] vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi] vmovdqu ymm2, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32] vmovdqu ymm3, [eax + esi + 32]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32] vmovdqu ymm3, [eax + esi * 2 + 32]
vpavgb ymm2, ymm2, [eax + edi] vpmaddubsw ymm2, ymm2, ymm4
vpavgb ymm3, ymm3, [eax + edi + 32] vpmaddubsw ymm3, ymm3, ymm4
lea eax, [eax + 64] vpaddw ymm0, ymm0, ymm2 // add row 2
vpavgb ymm0, ymm0, ymm2 vpaddw ymm1, ymm1, ymm3
vpavgb ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + edi]
vmovdqu ymm3, [eax + edi + 32]
vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) lea eax, [eax + 64]
vpand ymm3, ymm1, ymm7 vpmaddubsw ymm2, ymm2, ymm4
vpsrlw ymm0, ymm0, 8 vpmaddubsw ymm3, ymm3, ymm4
vpsrlw ymm1, ymm1, 8 vpaddw ymm0, ymm0, ymm2 // add row 3
vpavgw ymm0, ymm0, ymm2 vpaddw ymm1, ymm1, ymm3
vpavgw ymm1, ymm1, ymm3 vphaddw ymm0, ymm0, ymm1 // mutates
vpackuswb ymm0, ymm0, ymm1 vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpaddw ymm0, ymm0, ymm5 // + 8 for round
vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0 vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0 vmovdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16 sub ecx, 16
@ -499,9 +502,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored // src_stride ignored
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
movdqa xmm3, kShuf0 movdqa xmm3, xmmword ptr kShuf0
movdqa xmm4, kShuf1 movdqa xmm4, xmmword ptr kShuf1
movdqa xmm5, kShuf2 movdqa xmm5, xmmword ptr kShuf2
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax]
@ -548,12 +551,12 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, kShuf01 movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, kShuf11 movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, kShuf21 movdqa xmm4, xmmword ptr kShuf21
movdqa xmm5, kMadd01 movdqa xmm5, xmmword ptr kMadd01
movdqa xmm6, kMadd11 movdqa xmm6, xmmword ptr kMadd11
movdqa xmm7, kRound34 movdqa xmm7, xmmword ptr kRound34
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
@ -579,7 +582,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
lea eax, [eax + 32] lea eax, [eax + 32]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pshufb xmm0, xmm4 pshufb xmm0, xmm4
movdqa xmm1, kMadd21 movdqa xmm1, xmmword ptr kMadd21
pmaddubsw xmm0, xmm1 pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7 paddsw xmm0, xmm7
psrlw xmm0, 2 psrlw xmm0, 2
@ -605,12 +608,12 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, kShuf01 movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, kShuf11 movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, kShuf21 movdqa xmm4, xmmword ptr kShuf21
movdqa xmm5, kMadd01 movdqa xmm5, xmmword ptr kMadd01
movdqa xmm6, kMadd11 movdqa xmm6, xmmword ptr kMadd11
movdqa xmm7, kRound34 movdqa xmm7, xmmword ptr kRound34
wloop: wloop:
movdqu xmm0, [eax] // pixels 0..7 movdqu xmm0, [eax] // pixels 0..7
@ -639,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
pavgb xmm1, xmm0 pavgb xmm1, xmm0
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pshufb xmm0, xmm4 pshufb xmm0, xmm4
movdqa xmm1, kMadd21 movdqa xmm1, xmmword ptr kMadd21
pmaddubsw xmm0, xmm1 pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7 paddsw xmm0, xmm7
psrlw xmm0, 2 psrlw xmm0, 2
@ -665,8 +668,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored // src_stride ignored
mov edx, [esp + 12] // dst_ptr mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width mov ecx, [esp + 16] // dst_width
movdqa xmm4, kShuf38a movdqa xmm4, xmmword ptr kShuf38a
movdqa xmm5, kShuf38b movdqa xmm5, xmmword ptr kShuf38b
xloop: xloop:
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
@ -698,9 +701,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, kShufAc movdqa xmm2, xmmword ptr kShufAc
movdqa xmm3, kShufAc3 movdqa xmm3, xmmword ptr kShufAc3
movdqa xmm4, kScaleAc33 movdqa xmm4, xmmword ptr kScaleAc33
pxor xmm5, xmm5 pxor xmm5, xmm5
xloop: xloop:
@ -763,10 +766,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, kShufAb0 movdqa xmm2, xmmword ptr kShufAb0
movdqa xmm3, kShufAb1 movdqa xmm3, xmmword ptr kShufAb1
movdqa xmm4, kShufAb2 movdqa xmm4, xmmword ptr kShufAb2
movdqa xmm5, kScaleAb2 movdqa xmm5, xmmword ptr kScaleAb2
xloop: xloop:
movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm0, [eax] // average 2 rows into xmm0
@ -857,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
} }
#endif // HAS_SCALEADDROW_AVX2 #endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version. // Bilinear column filtering. SSSE3 version.
__declspec(naked) __declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@ -874,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm5, eax movd xmm5, eax
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9 psrlw xmm6, 9
pcmpeqb xmm7, xmm7 // generate 0x0001
psrlw xmm7, 15
pextrw eax, xmm2, 1 // get x0 integer. preroll pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2 sub ecx, 2
jl xloop29 jl xloop29
@ -896,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm4, ebx movd xmm4, ebx
pshufb xmm1, xmm5 // 0011 pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4 punpcklwd xmm0, xmm4
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm1, xmm6 // 0..7f and 7f..0 pxor xmm1, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration.
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
packuswb xmm0, xmm0 // 8 bits, 2 pixels. psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
movd ebx, xmm0 packuswb xmm1, xmm1 // 8 bits, 2 pixels.
movd ebx, xmm1
mov [edi], bx mov [edi], bx
lea edi, [edi + 2] lea edi, [edi + 2]
sub ecx, 2 // 2 pixels sub ecx, 2 // 2 pixels
jge xloop2 jge xloop2
xloop29: xloop29:
add ecx, 2 - 1 add ecx, 2 - 1
jl xloop99 jl xloop99
@ -918,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm0, ebx movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions. psrlw xmm2, 9 // 7 bit fractions.
pshufb xmm2, xmm5 // 0011 pshufb xmm2, xmm5 // 0011
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm2, xmm6 // 0..7f and 7f..0 pxor xmm2, xmm6 // 0..7f and 7f..0
pmaddubsw xmm0, xmm2 // 16 bit paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. pmaddubsw xmm2, xmm0 // 16 bit
packuswb xmm0, xmm0 // 8 bits paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
movd ebx, xmm0 psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm2, xmm2 // 8 bits
movd ebx, xmm2
mov [edi], bl mov [edi], bl
xloop99: xloop99:
@ -1233,8 +1253,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
mov ecx, [esp + 8 + 12] // dst_width mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx movd xmm3, [esp + 8 + 20] // dx
movdqa xmm4, kShuffleColARGB movdqa xmm4, xmmword ptr kShuffleColARGB
movdqa xmm5, kShuffleFractions movdqa xmm5, xmmword ptr kShuffleFractions
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9 psrlw xmm6, 9
pextrw eax, xmm2, 1 // get x0 integer. preroll pextrw eax, xmm2, 1 // get x0 integer. preroll

View File

@ -25,6 +25,7 @@ struct FourCCAliasEntry {
static const struct FourCCAliasEntry kFourCCAliases[] = { static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420}, {FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422}, {FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444}, {FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2}, {FOURCC_YUYV, FOURCC_YUY2},

File diff suppressed because it is too large Load Diff