Revert "libyuv: update to b8ddb5a2"

This reverts commit b8f83282f8.

Update was to wrong version and still has: 

BUG=webm:1252

Change-Id: I80f3a7c0581ab5e2dd1a84f7840e51d7c362afac
This commit is contained in:
James Bankoski
2016-06-29 23:05:51 +00:00
parent b8f83282f8
commit 291033032e
55 changed files with 8255 additions and 7042 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: b8ddb5a2
Version: 1456
License: BSD
License File: LICENSE
@@ -13,13 +13,3 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
Remove files unnecessary to libvpx build.
rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \
LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \
all.gyp build_overrides/ chromium/ codereview.settings docs/ \
download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \
include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \
libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \
third_party/ tools/ unit_test/ util/ winarm.mk

View File

@@ -12,8 +12,10 @@
#define INCLUDE_LIBYUV_CONVERT_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following headers includes.
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#ifdef __cplusplus
namespace libyuv {

View File

@@ -12,8 +12,10 @@
#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
#include "libyuv/basic_types.h"
#include "libyuv/rotate.h" // For enum RotationMode.
// TODO(fbarchard): Remove the following headers includes
#include "libyuv/convert_from.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
// TODO(fbarchard): Add tests. Create random content of right size and convert
@@ -58,22 +60,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert I411 to ARGB.
LIBYUV_API
int I411ToARGB(const uint8* src_y, int src_stride_y,
@@ -82,24 +68,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate);
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate);
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
@@ -163,54 +131,6 @@ int J422ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,

View File

@@ -56,6 +56,8 @@ int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,

View File

@@ -18,8 +18,9 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
static const int kCpuInitialized = 0x1;
#define kCpuInit 0x1
// These flags are only valid on ARM processors.
static const int kCpuHasARM = 0x2;
@@ -36,12 +37,12 @@ static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
static const int kCpuHasAVX3 = 0x2000;
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
static const int kCpuHasMIPS = 0x10000;
static const int kCpuHasDSPR2 = 0x20000;
static const int kCpuHasMIPS_DSP = 0x20000;
static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
@@ -56,13 +57,13 @@ int ArmCpuCaps(const char* cpuinfo_name);
// returns non-zero if instruction set is detected
static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API extern int cpu_info_;
return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
}
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(1) to disable all cpu specific optimizations.
// MaskCpuFlags(0) to disable all cpu specific optimizations.
LIBYUV_API
void MaskCpuFlags(int enable_flags);

View File

@@ -145,6 +145,13 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert NV21 to RGB565.
LIBYUV_API
int NV21ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// I422ToARGB is in convert_argb.h
// Convert I422 to BGRA.
LIBYUV_API
@@ -170,14 +177,6 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height);
// Alias
#define RGB24ToRAW RAWToRGB24
LIBYUV_API
int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height);
// Draw a rectangle into I420.
LIBYUV_API
int I420Rect(uint8* dst_y, int dst_stride_y,
@@ -282,19 +281,13 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Copy Alpha channel of ARGB to alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Extract the alpha channel from ARGB.
LIBYUV_API
int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
uint8* dst_a, int dst_stride_a,
int width, int height);
// Copy Y channel to Alpha of ARGB.
// Copy ARGB to ARGB.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@@ -308,7 +301,6 @@ LIBYUV_API
ARGBBlendRow GetARGBBlend();
// Alpha Blend ARGB images and store to destination.
// Source is pre-multiplied by alpha using ARGBAttenuate.
// Alpha of destination is set to 255.
LIBYUV_API
int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
@@ -316,31 +308,6 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alpha Blend plane and store to destination.
// Source is not pre-multiplied by alpha.
LIBYUV_API
int BlendPlane(const uint8* src_y0, int src_stride_y0,
const uint8* src_y1, int src_stride_y1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Alpha Blend YUV images and store to destination.
// Source is not pre-multiplied by alpha.
// Alpha is full width x height and subsampled to half size to apply to UV.
LIBYUV_API
int I420Blend(const uint8* src_y0, int src_stride_y0,
const uint8* src_u0, int src_stride_u0,
const uint8* src_v0, int src_stride_v0,
const uint8* src_y1, int src_stride_y1,
const uint8* src_u1, int src_stride_u1,
const uint8* src_v1, int src_stride_v1,
const uint8* alpha, int alpha_stride,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
LIBYUV_API
int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
@@ -390,6 +357,12 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert MJPG to ARGB.
LIBYUV_API
int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
@@ -416,49 +389,22 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height, uint32 value);
// Interpolate between two images using specified amount of interpolation
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
// and 255 means 1% src0 and 99% src1.
LIBYUV_API
int InterpolatePlane(const uint8* src0, int src_stride0,
const uint8* src1, int src_stride1,
uint8* dst, int dst_stride,
int width, int height, int interpolation);
// Interpolate between two ARGB images using specified amount of interpolation
// Internally calls InterpolatePlane with width * 4 (bpp).
// (0 to 255) and store to destination.
// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
// and 255 means 1% src_argb0 and 99% src_argb1.
// Internally uses ARGBScale bilinear filtering.
// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
// Interpolate between two YUV images using specified amount of interpolation
// Internally calls InterpolatePlane on each plane where the U and V planes
// are half width and half height.
LIBYUV_API
int I420Interpolate(const uint8* src0_y, int src0_stride_y,
const uint8* src0_u, int src0_stride_u,
const uint8* src0_v, int src0_stride_v,
const uint8* src1_y, int src1_stride_y,
const uint8* src1_u, int src1_stride_u,
const uint8* src1_v, int src1_stride_v,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))

View File

@@ -22,24 +22,53 @@ extern "C" {
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
// The following are available for Visual C and clangcl 32 bit:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
// The following are available for GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
// The following are available for 32 bit GCC:
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
@@ -56,8 +85,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_DSPR2
#define HAS_TRANSPOSEUVWX8_DSPR2
#define HAS_TRANSPOSEWX8_MIPS_DSPR2
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
@@ -71,9 +100,7 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
@@ -82,7 +109,7 @@ void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
@@ -99,17 +126,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);

File diff suppressed because it is too large Load Diff

View File

@@ -35,6 +35,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering);
// TODO(fbarchard): Implement this.
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,

View File

@@ -23,26 +23,6 @@ extern "C" {
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
#define GCC_HAS_AVX2 1
#endif // GNUC >= 4.7
#endif // __GNUC__
// clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif // clang >= 3.4
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
@@ -62,23 +42,24 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEROWDOWN4_SSE2
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
// The following are available on VS2012:
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
// The following are available on Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@@ -96,10 +77,10 @@ extern "C" {
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SCALEROWDOWN2_DSPR2
#define HAS_SCALEROWDOWN4_DSPR2
#define HAS_SCALEROWDOWN34_DSPR2
#define HAS_SCALEROWDOWN38_DSPR2
#define HAS_SCALEROWDOWN2_MIPS_DSPR2
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -152,8 +133,6 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width);
void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -235,11 +214,11 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
@@ -247,9 +226,9 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
@@ -272,13 +251,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
@@ -286,11 +263,9 @@ void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
@@ -443,8 +418,6 @@ void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -474,25 +447,27 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#ifdef __cplusplus

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1601
#define LIBYUV_VERSION 1456
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@@ -90,8 +90,7 @@ enum FourCC {
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
FOURCC_J400 = FOURCC('J', '4', '0', '0'),
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -151,7 +150,6 @@ enum FourCCBpp {
FOURCC_BPP_YU12 = 12,
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,

View File

@@ -17,7 +17,6 @@
#endif
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
@@ -27,13 +26,30 @@ namespace libyuv {
extern "C" {
#endif
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
HashDjb2_C;
uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@@ -111,6 +127,23 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
return fourcc;
}
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
#ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,

View File

@@ -10,8 +10,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {

View File

@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -18,13 +16,11 @@ namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
asm volatile ( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
@@ -58,10 +54,15 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"+r"(count), // %2
"=g"(sse) // %3
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
); // NOLINT
return sse;
}
#endif // defined(__x86_64__) || defined(__i386__)
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
@@ -90,7 +91,7 @@ static uvec32 kHashMul3 = {
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
uint32 hash;
asm volatile (
asm volatile ( // NOLINT
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
@@ -139,7 +140,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"m"(kHashMul3) // %8
: "memory", "cc"
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
); // NOLINT
return hash;
}
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))

View File

@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -29,6 +27,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n"

View File

@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -28,6 +26,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"

View File

@@ -9,8 +9,6 @@
*/
#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -18,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
@@ -101,32 +100,41 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
}
#endif // _MSC_VER >= 1700
uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
uvec32 kHashMul0 = {
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
static uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
static uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
static uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
static uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
@@ -135,30 +143,30 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, xmmword ptr kHash16x33
movdqa xmm6, kHash16x33
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
movdqa xmm5, xmmword ptr kHashMul0
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
movdqa xmm5, kHashMul0
movdqa xmm2, xmm1
punpcklbw xmm2, xmm7 // src[0-7]
movdqa xmm3, xmm2
punpcklwd xmm3, xmm7 // src[0-3]
pmulld xmm3, xmm5
movdqa xmm5, xmmword ptr kHashMul1
pmulld(0xdd) // pmulld xmm3, xmm5
movdqa xmm5, kHashMul1
movdqa xmm4, xmm2
punpckhwd xmm4, xmm7 // src[4-7]
pmulld xmm4, xmm5
movdqa xmm5, xmmword ptr kHashMul2
pmulld(0xe5) // pmulld xmm4, xmm5
movdqa xmm5, kHashMul2
punpckhbw xmm1, xmm7 // src[8-15]
movdqa xmm2, xmm1
punpcklwd xmm2, xmm7 // src[8-11]
pmulld xmm2, xmm5
movdqa xmm5, xmmword ptr kHashMul3
pmulld(0xd5) // pmulld xmm2, xmm5
movdqa xmm5, kHashMul3
punpckhwd xmm1, xmm7 // src[12-15]
pmulld xmm1, xmm5
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
@@ -183,37 +191,36 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
vmovd xmm0, [esp + 12] // seed
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
wloop:
vpmovzxbd xmm3, [eax] // src[0-3]
vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
vpmovzxbd xmm4, [eax + 4] // src[4-7]
vpmulld xmm3, xmm3, xmmword ptr kHashMul0
vpmovzxbd xmm2, [eax + 8] // src[8-11]
vpmulld xmm4, xmm4, xmmword ptr kHashMul1
vpmovzxbd xmm1, [eax + 12] // src[12-15]
vpmulld xmm2, xmm2, xmmword ptr kHashMul2
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
pmulld xmm3, kHashMul0
vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
pmulld xmm4, kHashMul1
vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
pmulld xmm2, kHashMul2
lea eax, [eax + 16]
vpmulld xmm1, xmm1, xmmword ptr kHashMul3
vpaddd xmm3, xmm3, xmm4 // add 16 results
vpaddd xmm1, xmm1, xmm2
vpaddd xmm1, xmm1, xmm3
vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
vpaddd xmm1, xmm1,xmm2
vpshufd xmm2, xmm1, 0x01
vpaddd xmm1, xmm1, xmm2
vpaddd xmm0, xmm0, xmm1
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
vmovd eax, xmm0 // return hash
vzeroupper
movd eax, xmm0 // return hash
ret
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus

View File

@@ -245,8 +245,8 @@ static int X420ToI420(const uint8* src_y,
int y;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) = SplitUVRow_C;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
if (!src_y || !src_uv ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
@@ -303,14 +303,14 @@ static int X420ToI420(const uint8* src_y,
}
}
#endif
#if defined(HAS_SPLITUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_DSPR2;
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_DSPR2;
SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
#endif
@@ -390,9 +390,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
int width, int height) {
int y;
void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int width) = YUY2ToYRow_C;
uint8* dst_y, int pix) = YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -455,9 +455,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
int width, int height) {
int y;
void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int width) = UYVYToYRow_C;
uint8* dst_y, int pix) = UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -521,7 +521,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
int y;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb ||
!dst_y || !dst_u || !dst_v ||
@@ -597,7 +597,7 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
int y;
void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
BGRAToYRow_C;
if (!src_bgra ||
!dst_y || !dst_u || !dst_v ||
@@ -663,7 +663,7 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
int y;
void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
ABGRToYRow_C;
if (!src_abgr ||
!dst_y || !dst_u || !dst_v ||
@@ -729,7 +729,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
int y;
void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
RGBAToYRow_C;
if (!src_rgba ||
!dst_y || !dst_u || !dst_v ||
@@ -796,14 +796,14 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
#if defined(HAS_RGB24TOYROW_NEON)
void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
RGB24ToYRow_C;
#else
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB24ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
@@ -910,14 +910,14 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
#if defined(HAS_RAWTOYROW_NEON)
void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
RAWToYRow_C;
#else
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RAWToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_raw || !dst_y || !dst_u || !dst_v ||
@@ -1024,14 +1024,14 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
#if defined(HAS_RGB565TOYROW_NEON)
void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
RGB565ToYRow_C;
#else
void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB565ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
@@ -1146,14 +1146,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#if defined(HAS_ARGB1555TOYROW_NEON)
void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
ARGB1555ToYRow_C;
#else
void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
ARGB1555ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
@@ -1270,14 +1270,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
#if defined(HAS_ARGB4444TOYROW_NEON)
void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
ARGB4444ToYRow_C;
#else
void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
ARGB4444ToARGBRow_C;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
#endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||

View File

@@ -14,7 +14,6 @@
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle.
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
@@ -45,347 +44,18 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Convert I422 to ARGB with matrix
static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
// Convert I444 to ARGB.
LIBYUV_API
int I444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height);
}
// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
}
// Convert J420 to ABGR.
LIBYUV_API
int J420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H420 to ARGB.
LIBYUV_API
int H420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvH709Constants,
width, height);
}
// Convert H420 to ABGR.
LIBYUV_API
int H420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I420ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I422 to ARGB with matrix
static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
// Convert I422 to ARGB.
LIBYUV_API
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height);
}
// Convert I422 to ABGR.
LIBYUV_API
int I422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
}
// Convert J422 to ABGR.
LIBYUV_API
int J422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H422 to ARGB.
LIBYUV_API
int H422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvH709Constants,
width, height);
}
// Convert H422 to ABGR.
LIBYUV_API
int H422ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I422ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuH709Constants, // Use Yvu matrix
width, height);
}
// Convert I444 to ARGB with matrix
static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I444ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
@@ -433,7 +103,7 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -442,49 +112,81 @@ static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert I444 to ARGB.
// Convert I422 to ARGB.
LIBYUV_API
int I444ToARGB(const uint8* src_y, int src_stride_y,
int I422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I444ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height);
}
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
// Convert I444 to ABGR.
LIBYUV_API
int I444ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
return I444ToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert J444 to ARGB.
LIBYUV_API
int J444ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
return I444ToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_argb, dst_stride_argb,
&kYuvJPEGConstants,
width, height);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
// Convert I411 to ARGB.
@@ -499,7 +201,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I411ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
@@ -547,7 +248,7 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
@@ -556,143 +257,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert I420 with Alpha to preattenuated ARGB.
static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width, int height, int attenuate) {
int y;
void (*I422AlphaToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
const uint8* a_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) = I422AlphaToARGBRow_C;
void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBAttenuateRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
if (attenuate) {
ARGBAttenuateRow(dst_argb, dst_argb, width);
}
dst_argb += dst_stride_argb;
src_a += src_stride_a;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 with Alpha to ARGB.
LIBYUV_API
int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int attenuate) {
return I420AlphaToARGBMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
src_a, src_stride_a,
dst_argb, dst_stride_argb,
&kYuvI601Constants,
width, height, attenuate);
}
// Convert I420 with Alpha to ABGR.
LIBYUV_API
int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
const uint8* src_a, int src_stride_a,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height, int attenuate) {
return I420AlphaToARGBMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
src_a, src_stride_a,
dst_abgr, dst_stride_abgr,
&kYvuI601Constants, // Use Yvu matrix
width, height, attenuate);
}
// Convert I400 to ARGB.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
@@ -758,7 +322,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
J400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
@@ -885,7 +449,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB24ToARGBRow_C;
if (!src_rgb24 || !dst_argb ||
width <= 0 || height == 0) {
@@ -935,7 +499,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RAWToARGBRow_C;
if (!src_raw || !dst_argb ||
width <= 0 || height == 0) {
@@ -985,7 +549,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
RGB565ToARGBRow_C;
if (!src_rgb565 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1044,7 +608,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
int width, int height) {
int y;
void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
int width) = ARGB1555ToARGBRow_C;
int pix) = ARGB1555ToARGBRow_C;
if (!src_argb1555 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -1102,7 +666,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
int width, int height) {
int y;
void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
int width) = ARGB4444ToARGBRow_C;
int pix) = ARGB4444ToARGBRow_C;
if (!src_argb4444 || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@@ -1163,7 +727,6 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
@@ -1201,7 +764,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
NV12ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -1221,7 +784,6 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV21ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
@@ -1259,7 +821,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
NV21ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -1278,7 +840,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
void (*NV12ToARGBRow)(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = NV12ToARGBRow_C;
if (!src_m420 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1316,16 +877,14 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
#endif
for (y = 0; y < height - 1; y += 2) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
&kYuvI601Constants, width);
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
dst_argb + dst_stride_argb, &kYuvI601Constants, width);
dst_argb + dst_stride_argb, width);
dst_argb += dst_stride_argb * 2;
src_m420 += src_stride_m420 * 3;
}
if (height & 1) {
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
&kYuvI601Constants, width);
NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
}
return 0;
}
@@ -1336,10 +895,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*YUY2ToARGBRow)(const uint8* src_yuy2,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) =
void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
YUY2ToARGBRow_C;
if (!src_yuy2 || !dst_argb ||
width <= 0 || height == 0) {
@@ -1383,7 +939,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
for (y = 0; y < height; ++y) {
YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
YUY2ToARGBRow(src_yuy2, dst_argb, width);
src_yuy2 += src_stride_yuy2;
dst_argb += dst_stride_argb;
}
@@ -1396,10 +952,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*UYVYToARGBRow)(const uint8* src_uyvy,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) =
void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
UYVYToARGBRow_C;
if (!src_uyvy || !dst_argb ||
width <= 0 || height == 0) {
@@ -1443,13 +996,159 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
for (y = 0; y < height; ++y) {
UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
UYVYToARGBRow(src_uyvy, dst_argb, width);
src_uyvy += src_stride_uyvy;
dst_argb += dst_stride_argb;
}
return 0;
}
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -445,24 +445,221 @@ int I420ToNV21(const uint8* src_y, int src_stride_y,
return I420ToNV12(src_y, src_stride_y,
src_v, src_stride_v,
src_u, src_stride_u,
dst_y, dst_stride_y,
dst_y, src_stride_y,
dst_vu, dst_stride_vu,
width, height);
}
// Convert I422 to RGBA with matrix
static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
// Convert I420 to ARGB.
LIBYUV_API
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to BGRA.
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
int y;
void (*I422ToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToBGRARow_C;
if (!src_y || !src_u || !src_v || !dst_bgra ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
dst_stride_bgra = -dst_stride_bgra;
}
#if defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
dst_bgra += dst_stride_bgra;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to ABGR.
LIBYUV_API
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_abgr, int dst_stride_abgr,
int width, int height) {
int y;
void (*I422ToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToABGRRow_C;
if (!src_y || !src_u || !src_v || !dst_abgr ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr;
}
#if defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOABGRROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToABGRRow = I422ToABGRRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
dst_abgr += dst_stride_abgr;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to RGBA.
LIBYUV_API
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToRGBARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGBARow_C;
if (!src_y || !src_u || !src_v || !dst_rgba ||
width <= 0 || height == 0) {
@@ -498,18 +695,9 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I422TORGBAROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
I422ToRGBARow = I422ToRGBARow_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
dst_rgba += dst_stride_rgba;
src_y += src_stride_y;
if (y & 1) {
@@ -520,49 +708,18 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert I420 to RGBA.
// Convert I420 to RGB24.
LIBYUV_API
int I420ToRGBA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgba, int dst_stride_rgba,
int width, int height) {
return I420ToRGBAMatrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_rgba, dst_stride_rgba,
&kYuvI601Constants,
width, height);
}
// Convert I420 to BGRA.
LIBYUV_API
int I420ToBGRA(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bgra, int dst_stride_bgra,
int width, int height) {
return I420ToRGBAMatrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_bgra, dst_stride_bgra,
&kYvuI601Constants, // Use Yvu matrix
width, height);
}
// Convert I420 to RGB24 with matrix
static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24,
const struct YuvConstants* yuvconstants,
int width, int height) {
int y;
void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGB24Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb24 ||
width <= 0 || height == 0) {
@@ -600,7 +757,7 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
@@ -611,21 +768,6 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
return 0;
}
// Convert I420 to RGB24.
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
return I420ToRGB24Matrix(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
dst_rgb24, dst_stride_rgb24,
&kYuvI601Constants,
width, height);
}
// Convert I420 to RAW.
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
@@ -633,12 +775,57 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_v, int src_stride_v,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
return I420ToRGB24Matrix(src_y, src_stride_y,
src_v, src_stride_v, // Swap U and V
src_u, src_stride_u,
dst_raw, dst_stride_raw,
&kYvuI601Constants, // Use Yvu matrix
width, height);
int y;
void (*I422ToRAWRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRAWRow_C;
if (!src_y || !src_u || !src_v || !dst_raw ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_raw = dst_raw + (height - 1) * dst_stride_raw;
dst_stride_raw = -dst_stride_raw;
}
#if defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
dst_raw += dst_stride_raw;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
// Convert I420 to ARGB1555.
@@ -653,7 +840,6 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C;
if (!src_y || !src_u || !src_v || !dst_argb1555 ||
width <= 0 || height == 0) {
@@ -691,8 +877,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
width);
I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
dst_argb1555 += dst_stride_argb1555;
src_y += src_stride_y;
if (y & 1) {
@@ -716,7 +901,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C;
if (!src_y || !src_u || !src_v || !dst_argb4444 ||
width <= 0 || height == 0) {
@@ -754,8 +938,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
width);
I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
dst_argb4444 += dst_stride_argb4444;
src_y += src_stride_y;
if (y & 1) {
@@ -778,7 +961,6 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
@@ -816,7 +998,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
#endif
for (y = 0; y < height; ++y) {
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -847,10 +1029,9 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
@@ -888,12 +1069,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
@@ -924,7 +1105,7 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565;
@@ -1077,6 +1258,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
int halfwidth = (width + 1) / 2;
int halfheight = (height + 1) / 2;

View File

@@ -28,10 +28,10 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) = ARGBToUV444Row_C;
int pix) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
@@ -109,16 +109,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
@@ -133,22 +130,34 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -161,17 +170,9 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToUV422Row(src_argb, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
dst_y += dst_stride_y;
@@ -190,8 +191,8 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
int width, int height) {
int y;
void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) = ARGBToUV411Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
int pix) = ARGBToUV411Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -263,7 +264,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
@@ -372,7 +373,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
int halfwidth = (width + 1) >> 1;
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
@@ -477,9 +478,9 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
@@ -501,22 +502,34 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -529,14 +542,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -561,7 +567,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
ARGBToUV422Row(src_argb, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width);
I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
src_argb += src_stride_argb;
@@ -579,9 +585,9 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
int y;
void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
@@ -603,22 +609,34 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
@@ -631,14 +649,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -663,7 +674,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
ARGBToUV422Row(src_argb, row_u, row_v, width);
ARGBToYRow(src_argb, row_y, width);
I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
src_argb += src_stride_argb;
@@ -681,7 +692,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height) {
int y;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
if (!src_argb || !dst_y || width <= 0 || height == 0) {
return -1;
@@ -753,7 +764,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
int y;
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
@@ -801,7 +812,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
int y;
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
if (!src_argb || !dst_raw || width <= 0 || height == 0) {
return -1;
@@ -858,7 +869,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
const uint8* dither4x4, int width, int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@@ -910,7 +921,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height) {
int y;
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -966,7 +977,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb1555, int dst_stride_argb1555,
int width, int height) {
int y;
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB1555Row_C;
if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
return -1;
@@ -1022,7 +1033,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb4444, int dst_stride_argb4444,
int width, int height) {
int y;
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB4444Row_C;
if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
return -1;
@@ -1082,7 +1093,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
@@ -1146,24 +1157,21 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Convert ARGB to J422. (JPeg full range I422).
// ARGB little endian (bgra in memory) to J422
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUVJ422Row_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYJRow_C;
if (!src_argb ||
!dst_yj || !dst_u || !dst_v ||
width <= 0 || height == 0) {
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
@@ -1171,19 +1179,34 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_yj == width &&
dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJ422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
@@ -1204,20 +1227,12 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYJRow(src_argb, dst_yj, width);
ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
ARGBToYJRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
dst_yj += dst_stride_yj;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
@@ -1230,7 +1245,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
int width, int height) {
int y;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
if (!src_argb || !dst_yj || width <= 0 || height == 0) {
return -1;

View File

@@ -9,7 +9,6 @@
*/
#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"

View File

@@ -23,7 +23,7 @@ namespace libyuv {
extern "C" {
#endif
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
@@ -51,8 +51,8 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
// also enable temporary buffer.
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
crop_argb == sample;
uint8* dest_argb = crop_argb;
int dest_argb_stride = argb_stride;
uint8* tmp_argb = crop_argb;
int tmp_argb_stride = argb_stride;
uint8* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@@ -66,13 +66,13 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
int argb_size = crop_width * abs_crop_height * 4;
rotate_buffer = (uint8*)malloc(argb_size);
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
crop_argb = rotate_buffer;
argb_stride = crop_width * 4;
argb_stride = crop_width;
}
switch (format) {
@@ -176,6 +176,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
@@ -290,7 +291,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
if (need_buf) {
if (!r) {
r = ARGBRotate(crop_argb, argb_stride,
dest_argb, dest_argb_stride,
tmp_argb, tmp_argb_stride,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);

View File

@@ -39,13 +39,12 @@ int ConvertToI420(const uint8* sample,
int aligned_src_width = (src_width + 1) & ~1;
const uint8* src;
const uint8* src_uv;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
format != FOURCC_NV12 && format != FOURCC_NV21 &&
format != FOURCC_YV12) || y == sample;
format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
uint8* tmp_y = y;
uint8* tmp_u = u;
uint8* tmp_v = v;
@@ -53,14 +52,16 @@ int ConvertToI420(const uint8* sample,
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
uint8* rotate_buffer = NULL;
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (!y || !u || !v || !sample ||
src_width <= 0 || crop_width <= 0 ||
src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
inv_crop_height = -inv_crop_height;
}
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
@@ -213,6 +214,7 @@ int ConvertToI420(const uint8* sample,
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
case FOURCC_YV12: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;

View File

@@ -10,12 +10,12 @@
#include "libyuv/cpu_id.h"
#if defined(_MSC_VER)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
@@ -36,8 +36,7 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \
!defined(__clang__)
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
@@ -49,9 +48,9 @@ extern "C" {
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
@@ -64,7 +63,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
mov [edi + 8], ecx
mov [edi + 12], edx
}
#else // Visual C but not x86
#else
if (info_ecx == 0) {
__cpuid((int*)(cpu_info), info_eax);
} else {
@@ -72,9 +71,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx;
asm volatile (
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
@@ -90,7 +89,7 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
@@ -99,37 +98,28 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
}
#endif
// For VS2010 and earlier emit can be used:
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
// __asm {
// xor ecx, ecx // xcr 0
// xgetbv
// mov xcr0, eax
// }
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
// https://code.google.com/p/libyuv/issues/detail?id=529
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int GetXCR0() {
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return xcr0;
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
// Return optimization to previous setting.
#if defined(_M_IX86) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
@@ -161,9 +151,30 @@ int ArmCpuCaps(const char* cpuinfo_name) {
return 0;
}
#if defined(__mips__) && defined(__linux__)
static int MipsCpuCaps(const char* search_string) {
char cpuinfo_line[512];
const char* file_name = "/proc/cpuinfo";
FILE* f = fopen(file_name, "r");
if (!f) {
// Assume DSP if /proc/cpuinfo is unavailable.
// This will occur for Chrome sandbox for Pepper or Render process.
return kCpuHasMIPS_DSP;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
}
}
fclose(f);
return 0;
}
#endif
// CPU detect function for SIMD instruction sets.
LIBYUV_API
int cpu_info_ = 0; // cpu_info is not initialized yet.
int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
@@ -186,9 +197,8 @@ static LIBYUV_BOOL TestEnv(const char*) {
LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
// TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
@@ -197,7 +207,7 @@ int InitCpuFlags(void) {
if (cpu_info0[0] >= 7) {
CpuId(7, 0, cpu_info7);
}
cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
@@ -206,57 +216,57 @@ int InitCpuFlags(void) {
kCpuHasX86;
#ifdef HAS_XGETBV
// AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave
((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers
cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
}
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
TestOsSaveYmm()) { // Saves YMM.
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
kCpuHasAVX;
}
#endif
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info &= ~kCpuHasX86;
cpu_info_ &= ~kCpuHasX86;
}
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info &= ~kCpuHasSSE2;
cpu_info_ &= ~kCpuHasSSE2;
}
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info &= ~kCpuHasSSSE3;
cpu_info_ &= ~kCpuHasSSSE3;
}
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info &= ~kCpuHasSSE41;
cpu_info_ &= ~kCpuHasSSE41;
}
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info &= ~kCpuHasSSE42;
cpu_info_ &= ~kCpuHasSSE42;
}
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info &= ~kCpuHasAVX;
cpu_info_ &= ~kCpuHasAVX;
}
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info &= ~kCpuHasAVX2;
cpu_info_ &= ~kCpuHasAVX2;
}
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info &= ~kCpuHasERMS;
cpu_info_ &= ~kCpuHasERMS;
}
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info &= ~kCpuHasFMA3;
}
if (TestEnv("LIBYUV_DISABLE_AVX3")) {
cpu_info &= ~kCpuHasAVX3;
cpu_info_ &= ~kCpuHasFMA3;
}
#endif
#if defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info |= kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS_DSPR2;
#endif
cpu_info |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_DSPR2")) {
cpu_info &= ~kCpuHasDSPR2;
cpu_info_ |= kCpuHasMIPS;
if (getenv("LIBYUV_DISABLE_MIPS")) {
cpu_info_ &= ~kCpuHasMIPS;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
cpu_info_ &= ~kCpuHasMIPS_DSP;
}
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#endif
#if defined(__arm__) || defined(__aarch64__)
@@ -264,31 +274,28 @@ int InitCpuFlags(void) {
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info = kCpuHasNEON;
cpu_info_ = kCpuHasNEON;
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it.
// So for aarch64, neon enabling is hard coded here.
#endif
#if defined(__aarch64__)
cpu_info = kCpuHasNEON;
cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
cpu_info = ArmCpuCaps("/proc/cpuinfo");
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#endif
cpu_info |= kCpuHasARM;
cpu_info_ |= kCpuHasARM;
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info &= ~kCpuHasNEON;
cpu_info_ &= ~kCpuHasNEON;
}
#endif // __arm__
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info = 0;
cpu_info_ = 0;
}
cpu_info |= kCpuInitialized;
cpu_info_ = cpu_info;
return cpu_info;
return cpu_info_;
}
// Note that use of this function is not thread safe.
LIBYUV_API
void MaskCpuFlags(int enable_flags) {
cpu_info_ = InitCpuFlags() & enable_flags;

View File

@@ -59,7 +59,8 @@ const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
// Methods that are passed to jpeglib.
boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
void init_source(jpeg_decompress_struct* cinfo);
void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT
void skip_input_data(jpeg_decompress_struct* cinfo,
long num_bytes); // NOLINT
void term_source(jpeg_decompress_struct* cinfo);
void ErrorHandler(jpeg_common_struct* cinfo);
@@ -428,7 +429,8 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
return TRUE;
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
void skip_input_data(j_decompress_ptr cinfo,
long num_bytes) { // NOLINT
cinfo->src->next_input_byte += num_bytes;
}

View File

@@ -17,14 +17,44 @@ namespace libyuv {
extern "C" {
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
// Enable this to try scasb implementation.
// #define ENABLE_SCASB 1
#ifdef ENABLE_SCASB
// Multiple of 1.
__declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // src
mov eax, [esp + 8] // val
mov ecx, [esp + 12] // count
repne scasb
jne sr99
mov eax, edi
sub eax, 1
mov edi, edx
ret
sr99:
mov eax, 0
mov edi, edx
ret
}
}
#endif
// Helper function to scan for EOI marker.
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
if (sample_size >= 2) {
const uint8* end = sample + sample_size - 1;
const uint8* it = sample;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
for (;;) {
#ifdef ENABLE_SCASB
it = ScanRow_ERMS(it, 0xff, end - it);
#else
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
#endif
if (it == NULL) {
break;
}
@@ -33,26 +63,26 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
}
++it; // Skip over current 0xff.
}
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
}
if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Step over SOI marker.
sample += 2;
sample_size -= 2;
// Look for the End Of Image (EOI) marker near the end of the buffer.
// Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
@@ -60,8 +90,8 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
}
// Step over SOI marker and scan for EOI.
return ScanEOI(sample + 2, sample_size - 2);
return ScanEOI(sample, sample_size);
}
#ifdef __cplusplus

File diff suppressed because it is too large Load Diff

View File

@@ -49,13 +49,13 @@ void TransposePlane(const uint8* src, int src_stride,
}
}
#endif
#if defined(HAS_TRANSPOSEWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_Fast_DSPR2;
TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
} else {
TransposeWx8 = TransposeWx8_DSPR2;
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
}
#endif
@@ -117,6 +117,14 @@ void RotatePlane180(const uint8* src, int src_stride,
}
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MirrorRow = MirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
@@ -134,11 +142,11 @@ void RotatePlane180(const uint8* src, int src_stride,
}
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_DSPR2;
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_COPYROW_SSE2)
@@ -196,17 +204,14 @@ void TransposeUV(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
TransposeUVWx8 = TransposeUVWx8_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
}
#endif
#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_DSPR2;
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
}
#endif
@@ -267,22 +272,22 @@ void RotateUV180(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorUVRow = MirrorUVRow_NEON;
MirrorRowUV = MirrorUVRow_NEON;
}
#endif
#if defined(HAS_MIRRORUVROW_SSSE3)
#if defined(HAS_MIRRORROW_UV_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorUVRow = MirrorUVRow_SSSE3;
MirrorRowUV = MirrorUVRow_SSSE3;
}
#endif
#if defined(HAS_MIRRORUVROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorUVRow = MirrorUVRow_DSPR2;
MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
}
#endif
@@ -290,7 +295,7 @@ void RotateUV180(const uint8* src, int src_stride,
dst_b += dst_stride_b * (height - 1);
for (i = 0; i < height; ++i) {
MirrorUVRow(src, dst_a, dst_b, width);
MirrorRowUV(src, dst_a, dst_b, width);
src += src_stride;
dst_a -= dst_stride_a;
dst_b -= dst_stride_b;

View File

@@ -18,7 +18,7 @@ namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
@@ -26,49 +26,24 @@ extern "C" {
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_DSPR2
TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst_a, int dst_stride_a, \
uint8* dst_b, int dst_stride_b, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \
n); \
} \
TransposeUVWx8_C(src + n * 2, src_stride, \
dst_a + n * dst_stride_a, dst_stride_a, \
dst_b + n * dst_stride_b, dst_stride_b, r); \
}
#ifdef HAS_TRANSPOSEUVWX8_NEON
TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX8_DSPR2
TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
#endif
#undef TUVANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -17,17 +17,16 @@ extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -106,17 +105,144 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // defined(HAS_TRANSPOSEWX8_SSSE3)
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
@@ -245,20 +371,17 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
);
}
#endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
@@ -357,9 +480,11 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
);
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

View File

@@ -22,7 +22,7 @@ extern "C" {
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_DSPR2(const uint8* src, int src_stride,
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set push \n"
@@ -106,7 +106,7 @@ void TransposeWx8_DSPR2(const uint8* src, int src_stride,
);
}
void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set noat \n"
@@ -308,7 +308,7 @@ void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
);
}
void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {

View File

@@ -27,7 +27,7 @@ static uvec8 kVTbl4x4Transpose =
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
const uint8* src_temp;
const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@@ -35,6 +35,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %5, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
@@ -229,7 +230,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"4: \n"
: "=&r"(src_temp), // %0
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst), // %3
@@ -247,7 +248,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp;
const uint8* src_temp = NULL;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
@@ -255,6 +256,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %7, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
@@ -512,7 +514,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"4: \n"
: "=&r"(src_temp), // %0
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(src_stride), // %2
"+r"(dst_a), // %3

View File

@@ -26,7 +26,7 @@ static uvec8 kVTbl4x4Transpose =
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
const uint8* src_temp;
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
@@ -235,7 +235,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"4: \n"
: "=&r"(src_temp), // %0
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width64) // %3
@@ -255,7 +255,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp;
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
@@ -520,7 +520,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"4: \n"
: "=&r"(src_temp), // %0
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3

View File

@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,

View File

@@ -22,39 +22,6 @@ extern "C" {
// Subsampled source needs to be increase by 1 of not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
const uint8* a_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_AVX2
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
#undef ANY41C
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
@@ -73,9 +40,75 @@ ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOYUY2ROW_SSE2
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TORAWROW_AVX2
ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_J422TOARGBROW_SSSE3
ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_J422TOARGBROW_AVX2
ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOBGRAROW_AVX2
ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TOABGRROW_AVX2
ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
@@ -83,91 +116,8 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
#undef ANY31
// Note that odd width replication includes 444 due to implementation
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \
yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I411TOARGBROW_SSSE3
ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I411TOARGBROW_AVX2
ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#undef ANY31C
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
@@ -186,6 +136,32 @@ ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
@@ -245,55 +221,6 @@ ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#endif
#undef ANY21
// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
int width) { \
SIMD_ALIGNED(uint8 temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3
ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_SSSE3
ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV21TOARGBROW_AVX2
ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
#endif
#ifdef HAS_NV21TOARGBROW_NEON
ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
#endif
#undef ANY21C
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
@@ -325,10 +252,8 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif
@@ -344,16 +269,15 @@ ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
#if defined(HAS_I400TOARGBROW_AVX2)
ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_SSSE3)
ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7)
#endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
@@ -363,6 +287,10 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
@@ -371,9 +299,8 @@ ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
@@ -454,6 +381,9 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSE2
ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
#endif
@@ -466,44 +396,8 @@ ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 128, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
#endif
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
#undef ANY11B
// Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
@@ -546,35 +440,6 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
#endif
#undef ANY11P
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8 temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
#endif
#if defined(HAS_YUY2TOARGBROW_NEON)
ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
#endif
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
@@ -599,11 +464,14 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_DSPR2
ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
#endif
#undef ANY11T
@@ -628,6 +496,9 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
#ifdef HAS_MIRRORROW_SSSE3
ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
#endif
#ifdef HAS_MIRRORROW_SSE2
ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
#endif
@@ -677,25 +548,9 @@ ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
ANY_SIMD(src_ptr, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
/* repeat last 4 bytes for 422 subsampler */ \
if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \
if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
} \
/* repeat last 4 - 12 bytes for 411 subsampler */ \
if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \
} \
if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \
} \
if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
} \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
@@ -711,8 +566,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
#ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_DSPR2
ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -721,12 +576,16 @@ ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOUV422ROW_SSSE3
ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
@@ -748,11 +607,11 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\
if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, \
temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \
temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \
} \
ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
@@ -762,9 +621,6 @@ ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#ifdef HAS_ARGBTOUVROW_AVX2
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_AVX2
ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -375,12 +375,12 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_MIPS
// DSPR2 functions
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
".set push \n"
@@ -389,6 +389,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@@ -446,7 +447,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -456,6 +457,7 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
".p2align 2 \n"
"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
@@ -496,10 +498,10 @@ void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
);
}
void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x;
int y;
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -510,6 +512,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
@@ -579,7 +582,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "=&r" (y)
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t7", "t8", "t9"
@@ -593,7 +596,7 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define YUVTORGB \
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
@@ -652,12 +655,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
// TODO(fbarchard): accept yuv conversion constants.
void I422ToARGBRow_DSPR2(const uint8* y_buf,
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm__ __volatile__ (
".set push \n"
@@ -672,8 +673,9 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
".p2align 2 \n"
"1: \n"
YUVTORGB
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
@@ -715,8 +717,134 @@ void I422ToARGBRow_DSPR2(const uint8* y_buf,
);
}
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
"precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
".p2align 2 \n"
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
@@ -729,6 +857,7 @@ void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

146
third_party/libyuv/source/row_x86.asm vendored Normal file
View File

@@ -0,0 +1,146 @@
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"
SECTION .text
; cglobal numeric constants are parameters, gpr regs, mm regs
; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
psrlw m2, m2, 8
%endif
ALIGN 4
.convertloop:
mov%2 m0, [src_yuy2q]
mov%2 m1, [src_yuy2q + mmsize]
lea src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
pand m0, m0, m2 ; YUY2 even bytes are Y
pand m1, m1, m2
%else
psrlw m0, m0, 8 ; UYVY odd bytes are Y
psrlw m1, m1, 8
%endif
packuswb m0, m0, m1
%if cpuflag(AVX2)
vpermq m0, m0, 0xd8
%endif
sub pixd, mmsize
mov%2 [dst_yq], m0
lea dst_yq, [dst_yq + mmsize]
jg .convertloop
REP_RET
%endmacro
; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,
; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
psrlw m4, m4, 8
sub dst_vq, dst_uq
ALIGN 4
.convertloop:
mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize]
lea src_uvq, [src_uvq + mmsize * 2]
psrlw m2, m0, 8 ; odd bytes
psrlw m3, m1, 8
pand m0, m0, m4 ; even bytes
pand m1, m1, m4
packuswb m0, m0, m1
packuswb m2, m2, m3
%if cpuflag(AVX2)
vpermq m0, m0, 0xd8
vpermq m2, m2, 0xd8
%endif
mov%1 [dst_uq], m0
mov%1 [dst_uq + dst_vq], m2
lea dst_uq, [dst_uq + mmsize]
sub pixd, mmsize
jg .convertloop
REP_RET
%endmacro
INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,
; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
; int width);
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
sub src_vq, src_uq
ALIGN 4
.convertloop:
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
lea src_uq, [src_uq + mmsize]
punpcklbw m2, m0, m1 // first 8 UV pairs
punpckhbw m0, m0, m1 // next 8 UV pairs
%if cpuflag(AVX2)
vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
mov%1 [dst_uvq], m1
mov%1 [dst_uvq + mmsize], m2
%else
mov%1 [dst_uvq], m2
mov%1 [dst_uvq + mmsize], m0
%endif
lea dst_uvq, [dst_uvq + mmsize * 2]
sub pixd, mmsize
jg .convertloop
REP_RET
%endmacro
INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,

View File

@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
ScaleRowDown2Box_Any_SSSE3);
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
ScaleRowDown2Box_SSSE3);
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
}
}
#endif
@@ -85,12 +85,12 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
}
#endif
@@ -135,12 +135,12 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
ScaleRowDown2Box_16_SSE2);
}
#endif
#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown2 = filtering ?
ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
}
#endif
@@ -182,12 +182,12 @@ static void ScalePlaneDown4(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
#if defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
}
#endif
@@ -200,12 +200,12 @@ static void ScalePlaneDown4(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif
@@ -245,12 +245,12 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4_16_SSE2;
}
#endif
#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
}
#endif
@@ -325,16 +325,16 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
}
}
#endif
@@ -404,16 +404,16 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
}
}
#endif
@@ -517,16 +517,16 @@ static void ScalePlaneDown38(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN38_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
}
}
#endif
@@ -595,16 +595,16 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
}
}
#endif
@@ -659,6 +659,7 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -666,8 +667,7 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
scaletbl[boxwidth - minboxwidth] >> 16;
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
@@ -676,6 +676,7 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
@@ -683,8 +684,8 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
int ix = x >> 16;
x += dx;
boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
scaletbl[boxwidth - minboxwidth] >> 16;
*dst_ptr++ =
SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
@@ -874,6 +875,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -898,11 +907,11 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1002,11 +1011,11 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif
@@ -1063,6 +1072,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -1087,11 +1104,11 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1226,11 +1243,11 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif

View File

@@ -55,29 +55,12 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
dst_ptr + n * BPP, r); \
}
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
@@ -85,8 +68,6 @@ SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
@@ -94,12 +75,10 @@ SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_Odd_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
#ifdef HAS_SCALEROWDOWN4_SSE2
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2

View File

@@ -210,6 +210,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -234,12 +242,12 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -300,6 +308,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -324,10 +340,10 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
if (src_width >= 32768) {
@@ -465,19 +481,27 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
}
#endif
#if defined(HAS_I422TOARGBROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_DSPR2;
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -502,10 +526,10 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@@ -823,36 +847,6 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint32 src_fourcc,
int src_width, int src_height,
uint8* dst_argb, int dst_stride_argb,
uint32 dst_fourcc,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering) {
uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
int r;
I420ToARGB(src_y, src_stride_y,
src_u, src_stride_u,
src_v, src_stride_v,
argb_buffer, src_width * 4,
src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4,
src_width, src_height,
dst_argb, dst_stride_argb,
dst_width, dst_height,
clip_x, clip_y, clip_width, clip_height,
filtering);
free(argb_buffer);
return r;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -103,28 +103,6 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* s = src_ptr;
const uint8* t = src_ptr + src_stride;
int x;
dst_width -= 1;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
dst += 2;
s += 4;
t += 4;
}
if (dst_width & 1) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
dst += 1;
s += 2;
t += 2;
}
dst[0] = (s[0] + t[0] + 1) >> 1;
}
void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst, int dst_width) {
const uint16* s = src_ptr;
@@ -417,14 +395,8 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) (uint8)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// inteluses 7 bit math with rounding.
#define BLENDER(a, b, f) (uint8)((int)(a) + \
(((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
@@ -476,9 +448,8 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
}
#undef BLENDER
// Same as 8 bit arm blender but return is cast to uint16
#define BLENDER(a, b, f) (uint16)((int)(a) + \
((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) {
@@ -816,7 +787,6 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
}
}
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
#define BLENDERC(a, b, f, s) (uint32)( \
@@ -906,6 +876,14 @@ void ScalePlaneVertical(int src_height,
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -930,13 +908,13 @@ void ScalePlaneVertical(int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_DSPR2;
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
}
#endif
@@ -1004,13 +982,13 @@ void ScalePlaneVertical_16(int src_height,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_DSPR2)
if (TestCpuFlag(kCpuHasDSPR2) &&
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_DSPR2;
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width_bytes, 4)) {
InterpolateRow = InterpolateRow_16_DSPR2;
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
}
}
#endif

View File

@@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -17,8 +16,7 @@ extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -98,7 +96,7 @@ static uvec16 kScaleAb2 =
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
@@ -120,23 +118,25 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
@@ -145,17 +145,15 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
@@ -164,16 +162,16 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"psrlw $0x1,%%xmm0 \n"
"psrlw $0x1,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
@@ -188,105 +186,7 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1"
);
}
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -314,15 +214,12 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stridex3;
intptr_t stridex3 = 0;
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
"psllw $0x3,%%xmm5 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0x8,%%xmm7 \n"
"lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
LABELALIGN
@@ -331,28 +228,30 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"lea " MEMLEA(0x20,0) ",%0 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x4,%%xmm0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm5,%%xmm3 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm7,%%xmm2 \n"
"pand %%xmm7,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm7,%%xmm2 \n"
"pavgw %%xmm2,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
@@ -361,100 +260,13 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"=&r"(stridex3) // %3
"+r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
);
}
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
"vpsrlw $0x4,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(src_stride * 3)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
@@ -762,89 +574,61 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"movdqa %%xmm3,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"punpckhbw %%xmm5,%%xmm3 \n"
"movdqu " MEMACCESS(3) ",%%xmm2 \n"
"add %6,%3 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n"
"jg 1b \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
"+r"(tmp_height), // %2
"+r"(tmp_src), // %3
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
"vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
"vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
intptr_t x0, x1, temp_pixel;
intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
asm volatile (
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n" // 0x007f007f
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" // 0x00010001
"psrlw $0x9,%%xmm6 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"subl $0x2,%5 \n"
"jl 29f \n"
@@ -866,19 +650,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm0 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1
"paddusb %%xmm7,%%xmm1 \n"
"pmaddubsw %%xmm0,%%xmm1 \n"
"pxor %%xmm6,%%xmm1 \n"
"pmaddubsw %%xmm1,%%xmm0 \n"
"pextrw $0x1,%%xmm2,%k3 \n"
"pextrw $0x3,%%xmm2,%k4 \n"
"paddw %9,%%xmm1 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movd %%xmm1,%k2 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"mov %w2," MEMACCESS(0) " \n"
"lea " MEMLEA(0x2,0) ",%0 \n"
"subl $0x2,%5 \n"
"sub $0x2,%5 \n"
"jge 2b \n"
LABELALIGN
@@ -889,37 +670,23 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm2 \n"
"pshufb %%xmm5,%%xmm2 \n"
"psubb %8,%%xmm0 \n" // make pixels signed.
"pxor %%xmm6,%%xmm2 \n"
"paddusb %%xmm7,%%xmm2 \n"
"pmaddubsw %%xmm0,%%xmm2 \n"
"paddw %9,%%xmm2 \n" // make pixels unsigned.
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n"
"movd %%xmm2,%k2 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,%k2 \n"
"mov %b2," MEMACCESS(0) " \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"=&a"(temp_pixel), // %2
"=&r"(x0), // %3
"=&r"(x1), // %4
#if defined(__x86_64__)
"+a"(temp_pixel), // %2
"+r"(x0), // %3
"+r"(x1), // %4
"+rm"(dst_width) // %5
#else
"+m"(dst_width) // %5
#endif
: "rm"(x), // %6
"rm"(dx), // %7
#if defined(__x86_64__)
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
"rm"(dx) // %7
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -1028,7 +795,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t src_stepx_x12 = 0;
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
"lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
@@ -1050,7 +817,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"=&r"(src_stepx_x12) // %4
"+r"(src_stepx_x12) // %4
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
@@ -1062,7 +829,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, int src_stepx,
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t src_stepx_x12 = 0;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile (
"lea " MEMLEA3(0x00,1,4) ",%1 \n"
@@ -1095,7 +862,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(src_stepx_x12), // %4
"+r"(row1) // %5
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
@@ -1104,7 +871,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
intptr_t x0, x1;
intptr_t x0 = 0, x1 = 0;
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
@@ -1157,8 +924,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
"movd %%xmm0," MEMACCESS(2) " \n"
"99: \n"
: "=&a"(x0), // %0
"=&d"(x1), // %1
: "+a"(x0), // %0
"+d"(x1), // %1
"+r"(dst_argb), // %2
"+r"(src_argb), // %3
"+r"(dst_width) // %4
@@ -1209,7 +976,7 @@ static uvec8 kShuffleFractions = {
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
intptr_t x0, x1;
intptr_t x0 = 0, x1 = 0;
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
@@ -1272,8 +1039,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+rm"(dst_width), // %2
"=&r"(x0), // %3
"=&r"(x1) // %4
"+r"(x0), // %3
"+r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc", NACL_R14

View File

@@ -21,7 +21,7 @@ extern "C" {
defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__(
".set push \n"
@@ -31,6 +31,7 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -77,7 +78,7 @@ void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
const uint8* t = src_ptr + src_stride;
@@ -89,6 +90,7 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -176,7 +178,7 @@ void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
@@ -186,6 +188,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"beqz $t9, 2f \n"
" nop \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -231,7 +234,7 @@ void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
@@ -245,6 +248,7 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@@ -310,11 +314,12 @@ void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -356,13 +361,14 @@ void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t3, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -412,13 +418,14 @@ void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -464,12 +471,13 @@ void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -510,7 +518,7 @@ void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
const uint8* t = src_ptr + stride;
@@ -520,6 +528,7 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@@ -563,7 +572,7 @@ void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
intptr_t stride = src_stride;
@@ -577,6 +586,7 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
".set push \n"
".set noreorder \n"
".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|

View File

@@ -26,6 +26,7 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
@@ -46,6 +47,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
@@ -71,6 +73,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
@@ -98,6 +101,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -119,6 +123,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
@@ -157,6 +162,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -179,6 +185,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -238,6 +245,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
@@ -292,6 +300,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
asm volatile (
MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@@ -325,6 +334,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
MEMACCESS(7)
"vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -440,6 +450,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -532,8 +543,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp;
const uint8* src_tmp = NULL;
asm volatile (
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
@@ -552,7 +564,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "=&r"(src_tmp), // %0
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
@@ -572,16 +584,13 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// The NEON version mimics this formula:
// #define BLENDER(a, b, f) (uint8)((int)(a) +
// ((int)(f) * ((int)(b) - (int)(a)) >> 16))
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
@@ -612,8 +621,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vrshrn.s32 d18, q11, #16 \n"
"vrshrn.s32 d19, q12, #16 \n"
"vshrn.s32 d18, q11, #16 \n"
"vshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
@@ -740,6 +749,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
MEMACCESS(0)
@@ -763,6 +773,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -793,6 +804,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -833,6 +845,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
@@ -862,6 +875,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
asm volatile (
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
@@ -913,9 +927,10 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp;
int tmp = 0;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
@@ -935,7 +950,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"=&r"(tmp), // %5
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
@@ -959,6 +974,7 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3

View File

@@ -547,7 +547,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp;
const uint8* src_tmp = NULL;
asm volatile (
"1: \n"
"mov %0, %1 \n"
@@ -567,7 +567,7 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n"
: "=&r"(src_tmp), // %0
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
@@ -626,8 +626,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
"rshrn v6.4h, v16.4s, #16 \n"
"rshrn2 v6.8h, v17.4s, #16 \n"
"shrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
@@ -931,7 +931,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 tmp64;
int64 tmp64 = 0;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(v0, 0)
@@ -952,7 +952,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"=&r"(tmp64), // %5
"+r"(tmp64), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1"

View File

@@ -16,8 +16,9 @@ namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@@ -95,7 +96,7 @@ static uvec16 kScaleAb2 =
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -121,28 +122,31 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -154,7 +158,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
@@ -162,11 +166,8 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
wloop:
movdqu xmm0, [eax]
@@ -174,17 +175,19 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // vertical add
paddw xmm1, xmm3
psrlw xmm0, 1
psrlw xmm1, 1
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -243,12 +246,14 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -259,8 +264,6 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -278,23 +281,19 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add
vpaddw ymm1, ymm1, ymm3
vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2
vpsrlw ymm1, ymm1, 1
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -309,7 +308,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -340,7 +339,7 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
@@ -350,40 +349,42 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
movdqa xmm5, xmm4
packuswb xmm4, xmm4
psllw xmm5, 3 // constant 0x0008
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
wloop:
movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // vertical add rows 0, 1
paddw xmm1, xmm3
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // add row 2
paddw xmm1, xmm3
movdqu xmm2, [eax + edi]
movdqu xmm3, [eax + edi + 16]
movdqu xmm4, [eax + edi]
movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // add row 3
paddw xmm1, xmm3
phaddw xmm0, xmm1
paddw xmm0, xmm5 // + 8 for round
psrlw xmm0, 4 // /16 for average of 4 * 4
pavgb xmm2, xmm4
pavgb xmm3, xmm5
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm7
pand xmm3, xmm7
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
psrlw xmm0, 8
pand xmm2, xmm7
pavgw xmm0, xmm2
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
@@ -442,41 +443,37 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
vpsrlw ymm4, ymm4, 15
vpsllw ymm5, ymm4, 3 // constant 0x0008
vpackuswb ymm4, ymm4, ymm4
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
vpsrlw ymm7, ymm7, 8
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
vpaddw ymm1, ymm1, ymm3
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // add row 2
vpaddw ymm1, ymm1, ymm3
vmovdqu ymm2, [eax + edi]
vmovdqu ymm3, [eax + edi + 32]
vpavgb ymm2, ymm2, [eax + edi]
vpavgb ymm3, ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // add row 3
vpaddw ymm1, ymm1, ymm3
vphaddw ymm0, ymm0, ymm1 // mutates
vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
vpaddw ymm0, ymm0, ymm5 // + 8 for round
vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
vpavgb ymm0, ymm0, ymm2
vpavgb ymm1, ymm1, ymm3
vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
vpand ymm3, ymm1, ymm7
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -502,9 +499,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
movdqa xmm3, xmmword ptr kShuf0
movdqa xmm4, xmmword ptr kShuf1
movdqa xmm5, xmmword ptr kShuf2
movdqa xmm3, kShuf0
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
wloop:
movdqu xmm0, [eax]
@@ -551,12 +548,12 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, xmmword ptr kShuf21
movdqa xmm5, xmmword ptr kMadd01
movdqa xmm6, xmmword ptr kMadd11
movdqa xmm7, xmmword ptr kRound34
movdqa xmm2, kShuf01
movdqa xmm3, kShuf11
movdqa xmm4, kShuf21
movdqa xmm5, kMadd01
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
wloop:
movdqu xmm0, [eax] // pixels 0..7
@@ -582,7 +579,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
movdqa xmm1, xmmword ptr kMadd21
movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
@@ -608,12 +605,12 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShuf01
movdqa xmm3, xmmword ptr kShuf11
movdqa xmm4, xmmword ptr kShuf21
movdqa xmm5, xmmword ptr kMadd01
movdqa xmm6, xmmword ptr kMadd11
movdqa xmm7, xmmword ptr kRound34
movdqa xmm2, kShuf01
movdqa xmm3, kShuf11
movdqa xmm4, kShuf21
movdqa xmm5, kMadd01
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
wloop:
movdqu xmm0, [eax] // pixels 0..7
@@ -642,7 +639,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm4
movdqa xmm1, xmmword ptr kMadd21
movdqa xmm1, kMadd21
pmaddubsw xmm0, xmm1
paddsw xmm0, xmm7
psrlw xmm0, 2
@@ -668,8 +665,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
movdqa xmm4, xmmword ptr kShuf38a
movdqa xmm5, xmmword ptr kShuf38b
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
xloop:
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
@@ -701,9 +698,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShufAc
movdqa xmm3, xmmword ptr kShufAc3
movdqa xmm4, xmmword ptr kScaleAc33
movdqa xmm2, kShufAc
movdqa xmm3, kShufAc3
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
xloop:
@@ -766,10 +763,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
movdqa xmm2, xmmword ptr kShufAb0
movdqa xmm3, xmmword ptr kShufAb1
movdqa xmm4, xmmword ptr kShufAb2
movdqa xmm5, xmmword ptr kScaleAb2
movdqa xmm2, kShufAb0
movdqa xmm3, kShufAb1
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
xloop:
movdqu xmm0, [eax] // average 2 rows into xmm0
@@ -860,16 +857,6 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
{ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
{ 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -887,8 +874,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm5, eax
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
pcmpeqb xmm7, xmm7 // generate 0x0001
psrlw xmm7, 15
pextrw eax, xmm2, 1 // get x0 integer. preroll
sub ecx, 2
jl xloop29
@@ -911,22 +896,20 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm4, ebx
pshufb xmm1, xmm5 // 0011
punpcklwd xmm0, xmm4
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm1, xmm6 // 0..7f and 7f..0
paddusb xmm1, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels.
pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
pextrw eax, xmm2, 1 // get x0 integer. next iteration.
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round.
psrlw xmm1, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm1, xmm1 // 8 bits, 2 pixels.
movd ebx, xmm1
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm0, xmm0 // 8 bits, 2 pixels.
movd ebx, xmm0
mov [edi], bx
lea edi, [edi + 2]
sub ecx, 2 // 2 pixels
jge xloop2
xloop29:
add ecx, 2 - 1
jl xloop99
@@ -935,14 +918,11 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
pshufb xmm2, xmm5 // 0011
psubb xmm0, xmmword ptr kFsub80 // make pixels signed.
pxor xmm2, xmm6 // 0..7f and 7f..0
paddusb xmm2, xmm7 // +1 so 0..7f and 80..1
pmaddubsw xmm2, xmm0 // 16 bit
paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round.
psrlw xmm2, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm2, xmm2 // 8 bits
movd ebx, xmm2
pmaddubsw xmm0, xmm2 // 16 bit
psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
packuswb xmm0, xmm0 // 8 bits
movd ebx, xmm0
mov [edi], bl
xloop99:
@@ -1253,8 +1233,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
mov ecx, [esp + 8 + 12] // dst_width
movd xmm2, [esp + 8 + 16] // x
movd xmm3, [esp + 8 + 20] // dx
movdqa xmm4, xmmword ptr kShuffleColARGB
movdqa xmm5, xmmword ptr kShuffleFractions
movdqa xmm4, kShuffleColARGB
movdqa xmm5, kShuffleFractions
pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
psrlw xmm6, 9
pextrw eax, xmm2, 1 // get x0 integer. preroll

View File

@@ -25,7 +25,6 @@ struct FourCCAliasEntry {
static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
{FOURCC_YU24, FOURCC_I444},
{FOURCC_YUYV, FOURCC_YUY2},

1136
third_party/libyuv/source/x86inc.asm vendored Normal file

File diff suppressed because it is too large Load Diff