Merge "libyuv: update to r1305"

Johann 2015-03-03 09:20:02 -08:00 committed by Gerrit Code Review
commit 6cf7b3b240
46 changed files with 7713 additions and 10701 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1060
Version: 1305
License: BSD
License File: LICENSE
@ -13,4 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
None.

View File

@ -22,6 +22,11 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
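
Note: to make the new ARGBDetect API above concrete, a minimal usage sketch (not part of the change; the pixel buffer, stride and dimensions are caller-supplied placeholders):

    #include "libyuv/compare.h"
    #include "libyuv/video_common.h"  // FOURCC_ARGB / FOURCC_BGRA

    // Probe an opaque 32 bpp image to learn where its alpha byte lives.
    uint32 DetectPixelOrder(const uint8* pixels, int stride,
                            int width, int height) {
      uint32 fourcc = libyuv::ARGBDetect(pixels, stride, width, height);
      if (fourcc == libyuv::FOURCC_ARGB) {
        // Alpha was found at the offset the ARGB layout uses.
      } else if (fourcc == libyuv::FOURCC_BGRA) {
        // Alpha was found at the offset the BGRA layout uses.
      }
      return fourcc;  // 0 means the scan could not decide.
    }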

View File

@ -113,15 +113,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert Q420 to I420.
LIBYUV_API
int Q420ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@ -211,8 +202,6 @@ int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
#endif
// Note Bayer formats (BGGR) To I420 are in format_conversion.h
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_y" number of bytes in a row of the dst_y plane.

View File

@ -18,7 +18,6 @@
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@ -104,13 +103,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// TODO(fbarchard): Convert Q420 to ARGB.
// LIBYUV_API
// int Q420ToARGB(const uint8* src_y, int src_stride_y,
// const uint8* src_yuy2, int src_stride_yuy2,
// uint8* dst_argb, int dst_stride_argb,
// int width, int height);
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@ -123,6 +115,22 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@ -184,8 +192,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.

View File

@ -57,7 +57,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
@ -152,8 +151,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.

View File

@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
// Values in dither matrix from 0 to 255. 128 is best for no dither.
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height);
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@ -105,6 +112,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB to J422.
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_yj, int dst_stride_yj,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
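
Note: a brief sketch of the two additions above, ARGBToRGB565Dither and ARGBToJ422 (buffers and dimensions are placeholders; per the comment above, a dither table of all 128s disables dithering):

    #include <string.h>
    #include "libyuv/planar_functions.h"

    void ConvertWithNewApis(const uint8* argb, int argb_stride,
                            uint8* rgb565, int rgb565_stride,
                            uint8* yj, int yj_stride,
                            uint8* u, int u_stride,
                            uint8* v, int v_stride,
                            int width, int height) {
      uint8 dither8x8[64];
      memset(dither8x8, 128, sizeof(dither8x8));  // all 128s == no dithering
      libyuv::ARGBToRGB565Dither(argb, argb_stride, rgb565, rgb565_stride,
                                 dither8x8, width, height);
      // ARGBToJ422 emits full-range (JPEG) 4:2:2 planes.
      libyuv::ARGBToJ422(argb, argb_stride, yj, yj_stride,
                         u, u_stride, v, v_stride, width, height);
    }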

View File

@ -1,168 +0,0 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Convert Bayer RGB formats to I420.
LIBYUV_API
int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Temporary API mapper.
#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
LIBYUV_API
int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
uint32 src_fourcc_bayer);
// Convert I420 to Bayer RGB formats.
LIBYUV_API
int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
LIBYUV_API
int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
LIBYUV_API
int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
LIBYUV_API
int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Temporary API mapper.
#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
LIBYUV_API
int I420ToBayer(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
int width, int height,
uint32 dst_fourcc_bayer);
// Convert Bayer RGB formats to ARGB.
LIBYUV_API
int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
LIBYUV_API
int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
LIBYUV_API
int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
LIBYUV_API
int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Temporary API mapper.
#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
LIBYUV_API
int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height,
uint32 src_fourcc_bayer);
// Converts ARGB to Bayer RGB formats.
LIBYUV_API
int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height);
LIBYUV_API
int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height);
LIBYUV_API
int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height);
LIBYUV_API
int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height);
// Temporary API mapper.
#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
LIBYUV_API
int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height,
uint32 dst_fourcc_bayer);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT

File diff suppressed because it is too large.


View File

@ -34,6 +34,7 @@ void ScalePlane(const uint8* src, int src_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
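
Note: a hedged sketch of calling the newly exported 16-bit plane scaler, assuming its parameters mirror the 8-bit ScalePlane shown above (all names are placeholders):

    #include "libyuv/scale.h"

    void ScaleDepth16Plane(const uint16* src, int src_stride,
                           int src_width, int src_height,
                           uint16* dst, int dst_stride,
                           int dst_width, int dst_height) {
      libyuv::ScalePlane_16(src, src_stride, src_width, src_height,
                            dst, dst_stride, dst_width, dst_height,
                            libyuv::kFilterBilinear);
    }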

View File

@ -44,21 +44,13 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__aarch64__) || defined(LIBYUV_NEON))
/* #define HAS_SCALEROWDOWN2_NEON */
/* #define HAS_SCALEROWDOWN4_NEON */
/* #define HAS_SCALEROWDOWN34_NEON */
/* #define HAS_SCALEROWDOWN38_NEON */
/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif
// The following are available on Mips platforms:
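
Note: the practical effect of the consolidated guard above is that the same HAS_* macros are now defined on 32-bit and 64-bit ARM builds, so a dispatch site needs only one check. A fragment sketch with a hypothetical function-pointer variable:

    #if defined(HAS_SCALEROWDOWN2_NEON)
      // Taken on __ARM_NEON__ (armv7) builds and on __aarch64__ builds alike.
      if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
        ScaleRowDown2 = ScaleRowDown2_NEON;
      }
    #endif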
@ -208,15 +200,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -267,10 +250,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Row functions.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1059
#define LIBYUV_VERSION 1305
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@ -75,7 +75,7 @@ enum FourCC {
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
// 4 Secondary RGB formats: 4 Bayer Patterns.
// 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),

View File

@ -19,6 +19,7 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
@ -78,6 +79,54 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
return FOURCC_BGRA;
}
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
return FOURCC_ARGB;
}
argb += 8;
}
if (width & 1) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
return FOURCC_BGRA;
}
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
return FOURCC_ARGB;
}
}
return 0;
}
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
uint32 fourcc = 0;
int h;
// Coalesce rows.
if (stride_argb == width * 4) {
width *= height;
height = 1;
stride_argb = 0;
}
for (h = 0; h < height && fourcc == 0; ++h) {
fourcc = ARGBDetectRow_C(argb, width);
argb += stride_argb;
}
return fourcc;
}
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@ -114,8 +163,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
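
Note: since the SSE2 dispatch above no longer requires 16-byte aligned sources, callers do not need to align their planes. A small usage sketch, assuming contiguous planes (stride equal to width; names are placeholders):

    #include "libyuv/compare.h"

    // Sum of squared byte differences over two equally sized planes, then MSE.
    double PlaneMse(const uint8* plane_a, const uint8* plane_b,
                    int width, int height) {
      uint64 sse = libyuv::ComputeSumSquareError(plane_a, plane_b,
                                                 width * height);
      return static_cast<double>(sse) /
             (static_cast<double>(width) * height);
    }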

View File

@ -16,7 +16,8 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@ -56,46 +57,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"bgt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // __ARM_NEON__
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"

View File

@ -0,0 +1,63 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -25,11 +25,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqa " MEMACCESS(1) ",%%xmm2 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
@ -53,11 +53,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); // NOLINT
return sse;
}
@ -124,13 +120,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
@ -143,9 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
); // NOLINT
return hash;
}

View File

@ -27,13 +27,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
align 4
wloop:
movdqa xmm1, [eax]
movdqu xmm1, [eax]
lea eax, [eax + 16]
movdqa xmm2, [edx]
movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@ -45,6 +43,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@ -70,12 +69,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@ -85,6 +82,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
@ -170,7 +167,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@ -178,6 +174,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
@ -209,13 +205,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
sub ecx, 16
jg wloop
movd eax, xmm0 // return hash

View File

@ -188,17 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src, 16) &&
IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -207,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@ -283,20 +280,15 @@ static int X420ToI420(const uint8* src_y,
src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
SplitUVRow = SplitUVRow_AVX2;
@ -304,7 +296,7 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_NEON;
@ -312,17 +304,15 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
}
#endif
if (dst_y) {
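
Note: the hunks above show the pattern applied throughout this update. The minimum-width and pointer-alignment preconditions are gone, the _Unaligned_ variants are dropped, and dispatch now picks an _Any_ row function whenever the CPU feature is present, upgrading to the full-width kernel only when the width divides evenly. In generic form (all names hypothetical):

    void (*RowFn)(const uint8* src, uint8* dst, int width) = RowFn_C;
    #if defined(HAS_ROWFN_SSE2)
    if (TestCpuFlag(kCpuHasSSE2)) {
      RowFn = RowFn_Any_SSE2;        // any width, any alignment
      if (IS_ALIGNED(width, 16)) {
        RowFn = RowFn_SSE2;          // full 16-pixel blocks only
      }
    }
    #endif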
@ -391,125 +381,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height);
}
// Convert Q420 to I420.
// Format is rows of YY/YUYV
LIBYUV_API
int Q420ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
int halfheight;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) = YUY2ToUV422Row_C;
void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
YUY2ToYRow_C;
if (!src_y || !src_yuy2 ||
!dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_u = dst_u + (halfheight - 1) * dst_stride_u;
dst_v = dst_v + (halfheight - 1) * dst_stride_v;
dst_stride_y = -dst_stride_y;
dst_stride_u = -dst_stride_u;
dst_stride_v = -dst_stride_v;
}
// CopyRow for rows of just Y in Q420 copied to Y plane of I420.
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
CopyRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
src_yuy2 += src_stride_yuy2;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
}
return 0;
}
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
@ -529,23 +400,17 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUVRow = YUY2ToUVRow_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -555,11 +420,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUVRow = YUY2ToUVRow_NEON;
@ -602,23 +465,17 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUVRow = UYVYToUVRow_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -628,11 +485,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width >= 16) {
UYVYToUVRow = UYVYToUVRow_Any_NEON;
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUVRow = UYVYToUVRow_NEON;
@ -680,23 +535,17 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -706,7 +555,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -714,7 +563,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@ -761,34 +610,31 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
src_bgra = src_bgra + (height - 1) * src_stride_bgra;
src_stride_bgra = -src_stride_bgra;
}
#if defined(HAS_BGRATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
BGRAToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
BGRAToUVRow = BGRAToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
BGRAToYRow = BGRAToYRow_SSSE3;
}
}
}
}
#elif defined(HAS_BGRATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_BGRATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
BGRAToYRow = BGRAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_BGRATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
@ -830,34 +676,31 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
}
}
#elif defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVRow = ABGRToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
@ -899,34 +742,31 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
#if defined(HAS_RGBATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
RGBAToUVRow = RGBAToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
RGBAToYRow = RGBAToYRow_SSSE3;
}
}
}
}
#elif defined(HAS_RGBATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_RGBATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYRow = RGBAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RGBATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToUVRow = RGBAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_NEON;
}
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
@ -978,22 +818,23 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#if defined(HAS_RGB24TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
}
#else // HAS_RGB24TOYROW_NEON
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
@ -1001,7 +842,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1009,17 +850,13 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON
{
#if !defined(HAS_RGB24TOYROW_NEON)
@ -1095,22 +932,23 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#if defined(HAS_RAWTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
}
#else // HAS_RAWTOYROW_NEON
#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
@ -1118,7 +956,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1126,17 +964,13 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON
{
// Allocate 2 rows of ARGB.
@ -1210,22 +1044,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#if defined(HAS_RGB565TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON;
}
if (width >= 16) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB565ToUVRow = RGB565ToUVRow_NEON;
}
}
}
#else // HAS_RGB565TOYROW_NEON
#if defined(HAS_RGB565TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
@ -1233,7 +1065,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1241,15 +1073,12 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB565TOYROW_NEON
@ -1327,22 +1156,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#if defined(HAS_ARGB1555TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON;
}
if (width >= 16) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
}
}
}
#else // HAS_ARGB1555TOYROW_NEON
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
@ -1350,7 +1177,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1358,15 +1185,12 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB1555TOYROW_NEON
@ -1445,22 +1269,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#if defined(HAS_ARGB4444TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON;
}
if (width >= 16) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
}
}
}
#else // HAS_ARGB4444TOYROW_NEON
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
@ -1468,7 +1290,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@ -1476,15 +1298,12 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB4444TOYROW_NEON

View File

@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@ -79,17 +78,15 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I444TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
}
#elif defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_NEON;
@ -141,18 +138,15 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@ -160,7 +154,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@ -221,17 +215,15 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I411TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I411ToARGBRow = I411ToARGBRow_SSSE3;
}
}
}
#elif defined(HAS_I411TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I411TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I411ToARGBRow = I411ToARGBRow_NEON;
@ -276,15 +268,23 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_YTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
}
}
#elif defined(HAS_YTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_YTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_YTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
@ -326,17 +326,15 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
}
#elif defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
@ -447,15 +445,15 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = dst_stride_argb = 0;
}
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
}
}
#elif defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
@ -497,15 +495,15 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_stride_raw = dst_stride_argb = 0;
}
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
}
}
#elif defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
@ -547,15 +545,15 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = dst_stride_argb = 0;
}
#if defined(HAS_RGB565TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
}
}
#elif defined(HAS_RGB565TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_RGB565TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_NEON;
@ -597,15 +595,15 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
}
}
#elif defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
@ -647,15 +645,15 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
}
}
#elif defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
@ -693,17 +691,23 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_NV12TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV12ToARGBRow = NV12ToARGBRow_AVX2;
}
#elif defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@ -744,18 +748,23 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV21TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV21ToARGBRow = NV21ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_NV21TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV21ToARGBRow = NV21ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
@ -795,17 +804,23 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_NV12TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV12ToARGBRow = NV12ToARGBRow_AVX2;
}
#elif defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@ -852,19 +867,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_argb = 0;
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
// Posix is 16, Windows is 8.
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
}
#elif defined(HAS_YUY2TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_YUY2TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_NEON;
@ -905,19 +924,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_argb = 0;
}
#if defined(HAS_UYVYTOARGBROW_SSSE3)
// Posix is 16, Windows is 8.
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
UYVYToARGBRow = UYVYToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_UYVYTOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
UYVYToARGBRow = UYVYToARGBRow_AVX2;
}
#elif defined(HAS_UYVYTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_UYVYTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_NEON;
@ -932,6 +955,152 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
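
The hunks above all apply the same change to the row-function dispatch in this file: the "width >= N" minimum-width guards and the Unaligned_/pointer-alignment ladders are dropped, and #elif chains become independent #if blocks, so each instruction set is tested on its own and the _Any_ variant covers every width. A minimal sketch of the shape the dispatch converges on; the SomeRow* names are placeholders, not libyuv symbols:

  void (*SomeRow)(const uint8* src, uint8* dst, int width) = SomeRow_C;
#if defined(HAS_SOMEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    SomeRow = SomeRow_Any_SSSE3;   // handles any width, including the tail
    if (IS_ALIGNED(width, 16)) {
      SomeRow = SomeRow_SSSE3;     // full-speed kernel for exact multiples
    }
  }
#endif
#if defined(HAS_SOMEROW_NEON)      // a separate #if, not #elif: both paths are considered
  if (TestCpuFlag(kCpuHasNEON)) {
    SomeRow = SomeRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      SomeRow = SomeRow_NEON;
    }
  }
#endif
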
// Convert J420 to ARGB.
LIBYUV_API
int J420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
return 0;
}
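
J420ToARGB exists alongside I420ToARGB because J420 carries full-range (JPEG/JFIF) YUV, whereas I420 is treated as limited-range BT.601, so the J422ToARGBRow functions use different conversion constants; the "if (y & 1)" in the loop above advances the U and V pointers every other row because 4:2:0 chroma is subsampled vertically. A floating-point sketch of the full-range conversion, only to show the coefficient difference (the real row functions use fixed-point equivalents, and the helper names here are illustrative):

static unsigned char Clamp255(float v) {
  return (unsigned char)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

// Full-range (JPEG) YUV to RGB, the J420/J422 case:
//   R = Y + 1.402 (V - 128)
//   G = Y - 0.344 (U - 128) - 0.714 (V - 128)
//   B = Y + 1.772 (U - 128)
// Limited-range BT.601, the I420 case, instead scales Y by 1.164 after subtracting 16.
static void JpegYuvToRgb(unsigned char y, unsigned char u, unsigned char v,
                         unsigned char* r, unsigned char* g, unsigned char* b) {
  *r = Clamp255(y + 1.402f * (v - 128));
  *g = Clamp255(y - 0.344f * (u - 128) - 0.714f * (v - 128));
  *b = Clamp255(y + 1.772f * (u - 128));
}
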
// Convert J422 to ARGB.
LIBYUV_API
int J422ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*J422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = J422ToARGBRow_C;
if (!src_y || !src_u || !src_v ||
!dst_argb ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_J422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_J422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J422ToARGBRow = J422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J422ToARGBRow = J422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
J422ToARGBRow = J422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
J422ToARGBRow = J422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
}
#endif
for (y = 0; y < height; ++y) {
J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
return 0;
}
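
The "Coalesce rows" block above is a common libyuv optimization: when every plane is tightly packed (each stride equals the data width of that plane), the whole image can be treated as one long row and the row function is called once instead of height times. A small self-contained illustration of the test, using a hypothetical 64x4 image:

// Sketch of the coalescing condition for a 4:2:2 source (U/V planes are half width).
static int CanCoalesce422(int width, int stride_y, int stride_u, int stride_v,
                          int stride_argb) {
  return stride_y == width &&
         stride_u * 2 == width &&
         stride_v * 2 == width &&
         stride_argb == width * 4;
}
// e.g. a tightly packed 64x4 image: CanCoalesce422(64, 64, 32, 32, 256) is true,
// so the converter processes a single "row" of 64 * 4 = 256 pixels with all
// strides set to 0, exactly as the code above does.
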
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -13,7 +13,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/convert.h" // For I420Copy
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/scale.h" // For ScalePlane()
@ -174,14 +173,15 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
#elif defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@ -220,14 +220,15 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
dst_stride_yuy2 = -dst_stride_yuy2;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
#elif defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@ -280,14 +281,15 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
#elif defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@ -326,14 +328,15 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
dst_stride_uyvy = -dst_stride_uyvy;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
#elif defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@ -397,20 +400,15 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_stride_u = src_stride_v = dst_stride_uv = 0;
}
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
MergeUVRow_ = MergeUVRow_SSE2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@ -418,7 +416,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@ -476,18 +474,15 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@ -495,7 +490,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@ -548,23 +543,30 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_stride_bgra = -dst_stride_bgra;
}
#if defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToBGRARow = I422ToBGRARow_AVX2;
}
#elif defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
#endif
#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@ -610,17 +612,23 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_stride_abgr = -dst_stride_abgr;
}
#if defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOABGRROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToABGRRow = I422ToABGRRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToABGRRow = I422ToABGRRow_AVX2;
}
#elif defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
@ -664,17 +672,23 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
dst_stride_rgba = -dst_stride_rgba;
}
#if defined(HAS_I422TORGBAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TORGBAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGBARow = I422ToRGBARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGBARow = I422ToRGBARow_AVX2;
}
#elif defined(HAS_I422TORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
}
#endif
#if defined(HAS_I422TORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
@ -718,14 +732,15 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
dst_stride_rgb24 = -dst_stride_rgb24;
}
#if defined(HAS_I422TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
#elif defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
@ -769,14 +784,15 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
dst_stride_raw = -dst_stride_raw;
}
#if defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
#elif defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_NEON;
@ -820,14 +836,23 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
dst_stride_argb1555 = -dst_stride_argb1555;
}
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
}
}
#elif defined(HAS_I422TOARGB1555ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I422TOARGB1555ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGB1555ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_NEON;
@ -872,14 +897,23 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
dst_stride_argb4444 = -dst_stride_argb4444;
}
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
}
}
#elif defined(HAS_I422TOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I422TOARGB4444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_NEON;
@ -923,14 +957,23 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_I422TORGB565ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_SSSE3;
}
}
#elif defined(HAS_I422TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_I422TORGB565ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGB565Row = I422ToRGB565Row_AVX2;
}
}
#endif
#if defined(HAS_I422TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_NEON;
@ -1054,38 +1097,6 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_sample_stride ? dst_sample_stride : width * 4,
width, height);
break;
case FOURCC_BGGR:
r = I420ToBayerBGGR(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
break;
case FOURCC_GBRG:
r = I420ToBayerGBRG(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
break;
case FOURCC_GRBG:
r = I420ToBayerGRBG(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
break;
case FOURCC_RGGB:
r = I420ToBayerRGGB(y, y_stride,
u, u_stride,
v, v_stride,
dst_sample,
dst_sample_stride ? dst_sample_stride : width,
width, height);
break;
case FOURCC_I400:
r = I400Copy(y, y_stride,
dst_sample,
@ -1116,7 +1127,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
}
// TODO(fbarchard): Add M420 and Q420.
// TODO(fbarchard): Add M420.
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
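
The I422ToYUY2 and I422ToUYVY paths in this file pack two luma samples and one chroma pair into four bytes; the two formats differ only in byte order. A small reference sketch of the per-pair packing the row functions implement (the helper name is illustrative):

// Pack two 4:2:2 pixels into 4 bytes.  YUY2 order: Y0 U Y1 V.  UYVY order: U Y0 V Y1.
static void PackPair(unsigned char y0, unsigned char y1,
                     unsigned char u, unsigned char v,
                     unsigned char* yuy2, unsigned char* uyvy) {
  yuy2[0] = y0; yuy2[1] = u;  yuy2[2] = y1; yuy2[3] = v;
  uyvy[0] = u;  uyvy[1] = y0; uyvy[2] = v;  uyvy[3] = y1;
}
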


@ -12,7 +12,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
@ -51,17 +50,15 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUV444Row = ARGBToUV444Row_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON;
@ -69,19 +66,16 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -130,17 +124,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@ -149,18 +141,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -209,19 +198,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@ -229,7 +214,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -237,7 +222,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON;
@ -281,22 +266,17 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -304,7 +284,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@ -312,18 +292,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
MergeUVRow_ = MergeUVRow_SSE2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@ -331,7 +308,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@ -388,22 +365,17 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -411,7 +383,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@ -419,18 +391,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
MergeUVRow_ = MergeUVRow_SSE2;
}
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@ -438,7 +407,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@ -500,17 +469,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yuy2 = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@ -518,17 +485,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -537,14 +502,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
#elif defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@ -602,17 +568,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_uyvy = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@ -620,17 +584,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -639,14 +601,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
#elif defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@ -697,19 +660,15 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@ -717,7 +676,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@ -773,14 +732,15 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb24 = 0;
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
}
}
#elif defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
@ -820,14 +780,15 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_raw = 0;
}
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
}
}
#elif defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
@ -843,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0;
}
static const uint8 kDither8x8[64] = {
0, 128, 32, 160, 8, 136, 40, 168,
192, 64, 224, 96, 200, 72, 232, 104,
48, 176, 16, 144, 56, 184, 24, 152,
240, 112, 208, 80, 248, 120, 216, 88,
12, 140, 44, 172, 4, 132, 36, 164,
204, 76, 236, 108, 196, 68, 228, 100,
60, 188, 28, 156, 52, 180, 20, 148,
252, 124, 220, 92, 244, 116, 212, 84,
};
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
if (!dither8x8) {
dither8x8 = kDither8x8;
}
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
dither8x8 + ((y & 7) << 3), width);
src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565;
}
return 0;
}
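
ARGBToRGB565Dither applies an 8x8 ordered-dither matrix before truncating to 5:6:5; row y uses the eight entries at dither8x8 + ((y & 7) << 3), and passing NULL selects the built-in kDither8x8 table above. A hedged usage sketch, assuming tightly packed buffers (the wrapper name and buffers are hypothetical):

// Convert a width x height ARGB image to dithered RGB565 with the default matrix.
// argb is width * 4 bytes per row, rgb565 is width * 2 bytes per row.
int ToDitheredRGB565(const uint8* argb, uint8* rgb565, int width, int height) {
  return ARGBToRGB565Dither(argb, width * 4,
                            rgb565, width * 2,
                            NULL,  // NULL falls back to kDither8x8
                            width, height);
}
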
// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
@ -867,15 +868,23 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb565 = 0;
}
#if defined(HAS_ARGBTORGB565ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
}
#elif defined(HAS_ARGBTORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565Row = ARGBToRGB565Row_NEON;
@ -915,15 +924,23 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb1555 = 0;
}
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
}
}
#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOARGB1555ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
@ -963,15 +980,23 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb4444 = 0;
}
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
}
}
#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOARGB4444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
@ -1011,23 +1036,17 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@ -1035,7 +1054,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
@ -1043,7 +1062,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
@ -1067,6 +1086,80 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
return 0;
}
// ARGB little endian (bgra in memory) to J422
LIBYUV_API
int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
int y;
void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUVJ422Row_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYJRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJ422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
ARGBToYJRow(src_argb, dst_y, width);
src_argb += src_stride_argb;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
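
ARGBToJ422 writes full-range Y at full resolution and U/V at half horizontal resolution (one chroma sample per 2x1 block), one chroma row per image row. A hedged sizing and usage sketch, assuming tightly packed planes (the wrapper name is hypothetical):

// Plane sizes for a width x height source: Y is width * height bytes,
// U and V are ((width + 1) / 2) * height bytes each.
int ToJ422(const uint8* argb, int width, int height,
           uint8* y, uint8* u, uint8* v) {
  int halfwidth = (width + 1) / 2;
  return ARGBToJ422(argb, width * 4,
                    y, width,
                    u, halfwidth,
                    v, halfwidth,
                    width, height);
}
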
// Convert ARGB to J400.
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
@ -1091,19 +1184,15 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yj = 0;
}
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@ -1111,7 +1200,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;


@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@ -144,36 +143,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToARGB(src, src_width,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
@ -205,15 +174,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
// case FOURCC_Q420:
// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
// src_width + crop_x * 2;
// r = Q420ToARGB(src, src_width * 3,
// src_uv, src_width * 3,
// crop_argb, argb_stride,
// crop_width, inv_crop_height);
// break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
@ -241,6 +201,25 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_width, inv_crop_height);
break;
}
case FOURCC_J420: {
const uint8* src_y = sample + (src_width * crop_y + crop_x);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
src_u = sample + src_width * abs_src_height +
(halfwidth * crop_y + crop_x) / 2;
src_v = sample + src_width * abs_src_height +
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width,
src_u, halfwidth,
src_v, halfwidth,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;


@ -12,7 +12,6 @@
#include "libyuv/convert.h"
#include "libyuv/format_conversion.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
@ -173,40 +172,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
// TODO(fbarchard): Support cropping Bayer by odd numbers
// by adjusting fourcc.
case FOURCC_BGGR:
src = sample + (src_width * crop_y + crop_x);
r = BayerBGGRToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GBRG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGBRGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_GRBG:
src = sample + (src_width * crop_y + crop_x);
r = BayerGRBGToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_RGGB:
src = sample + (src_width * crop_y + crop_x);
r = BayerRGGBToI420(src, src_width,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
@ -218,7 +183,8 @@ int ConvertToI420(const uint8* sample,
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
@ -228,7 +194,8 @@ int ConvertToI420(const uint8* sample,
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
src_uv = sample + (src_width * src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
@ -245,17 +212,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
case FOURCC_Q420:
src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
src_width + crop_x * 2;
r = Q420ToI420(src, src_width * 3,
src_uv, src_width * 3,
y, y_stride,
u, u_stride,
v, v_stride,
crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
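
The NV12/NV21 hunks above correct the chroma offset used when cropping: the interleaved UV plane starts after the full Y plane (src_width * src_height bytes), each UV row covers two image rows, and a horizontal crop of crop_x pixels skips crop_x / 2 chroma pairs, i.e. (crop_x / 2) * 2 bytes. A small sketch of the corrected arithmetic (an illustrative helper, not a libyuv API):

// Address of the first cropped byte in an NV12/NV21 UV plane.
// aligned_src_width is the stride of the interleaved UV plane.
static const uint8* CroppedUV(const uint8* sample,
                              int src_width, int src_height,
                              int aligned_src_width,
                              int crop_x, int crop_y) {
  return sample + (src_width * src_height) +   // skip the Y plane
         ((crop_y / 2) * aligned_src_width) +  // one UV row per two Y rows
         ((crop_x / 2) * 2);                   // 2 bytes (U, V) per chroma sample
}
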


@ -52,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
#endif
#if defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
@ -98,13 +99,15 @@ int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER)
#endif
#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
#endif
#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
@ -135,6 +138,12 @@ int ArmCpuCaps(const char* cpuinfo_name) {
fclose(f);
return kCpuHasNEON;
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
if (p && (p[6] == ' ' || p[6] == '\n')) {
fclose(f);
return kCpuHasNEON;
}
}
}
fclose(f);
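
On aarch64 kernels /proc/cpuinfo lists Advanced SIMD as "asimd" rather than "neon" in the Features line, which is why the added check above matches " asimd" followed by a space or newline. A self-contained sketch of the same substring test against a typical Features line (the example line is illustrative, not read from a real file):

#include <stdio.h>
#include <string.h>

int main(void) {
  const char line[] = "Features\t: fp asimd evtstrm aes pmull sha1 sha2 crc32\n";
  const char* p = strstr(line, " asimd");
  int has_neon = p && (p[6] == ' ' || p[6] == '\n');
  printf("NEON/ASIMD detected: %d\n", has_neon);  // prints 1 for this line
  return 0;
}
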
@ -240,7 +249,8 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3;
}
#elif defined(__mips__) && defined(__linux__)
#endif
#if defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
@ -257,7 +267,8 @@ int InitCpuFlags(void) {
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#elif defined(__arm__) || defined(__aarch64__)
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon.
@ -266,7 +277,8 @@ int InitCpuFlags(void) {
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it.
// So for aarch64, neon enabling is hard coded here.
#elif defined(__aarch64__)
#endif
#if defined(__aarch64__)
cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.


@ -1,554 +0,0 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/format_conversion.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// generate a selector mask useful for pshufb
static uint32 GenerateSelector(int select0, int select1) {
return (uint32)(select0) |
(uint32)((select1 + 4) << 8) |
(uint32)((select0 + 8) << 16) |
(uint32)((select1 + 12) << 24);
}
static int MakeSelectors(const int blue_index,
const int green_index,
const int red_index,
uint32 dst_fourcc_bayer,
uint32* index_map) {
// Now build a lookup table containing the indices for the four pixels in each
// 2x2 Bayer grid.
switch (dst_fourcc_bayer) {
case FOURCC_BGGR:
index_map[0] = GenerateSelector(blue_index, green_index);
index_map[1] = GenerateSelector(green_index, red_index);
break;
case FOURCC_GBRG:
index_map[0] = GenerateSelector(green_index, blue_index);
index_map[1] = GenerateSelector(red_index, green_index);
break;
case FOURCC_RGGB:
index_map[0] = GenerateSelector(red_index, green_index);
index_map[1] = GenerateSelector(green_index, blue_index);
break;
case FOURCC_GRBG:
index_map[0] = GenerateSelector(green_index, red_index);
index_map[1] = GenerateSelector(blue_index, green_index);
break;
default:
return -1; // Bad FourCC
}
return 0;
}
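
In this removed file, GenerateSelector packs four byte indices into a 32-bit pshufb selector: the requested channel offset of one ARGB pixel, then the other channel offset shifted into the next 4-byte pixel, alternating across four pixels. Worked out for blue_index = 0 and green_index = 1, i.e. the BG row of a BGGR pattern:

// GenerateSelector(0, 1)
//   = 0 | ((1 + 4) << 8) | ((0 + 8) << 16) | ((1 + 12) << 24)
//   = 0x00000000 | 0x00000500 | 0x00080000 | 0x0D000000
//   = 0x0D080500
// pshufb therefore picks source bytes 0, 5, 8, 13:
// B of pixel 0, G of pixel 1, B of pixel 2, G of pixel 3.
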
// Converts 32 bit ARGB to Bayer RGB formats.
LIBYUV_API
int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height,
uint32 dst_fourcc_bayer) {
int y;
const int blue_index = 0; // Offsets for ARGB format
const int green_index = 1;
const int red_index = 2;
uint32 index_map[2];
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) = ARGBToBayerRow_C;
if (height < 0) {
height = -height;
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
#endif
if (MakeSelectors(blue_index, green_index, red_index,
dst_fourcc_bayer, index_map)) {
return -1; // Bad FourCC
}
for (y = 0; y < height; ++y) {
ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
src_argb += src_stride_argb;
dst_bayer += dst_stride_bayer;
}
return 0;
}
#define AVG(a, b) (((a) + (b)) >> 1)
static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1];
int x;
for (x = 0; x < pix - 2; x += 2) {
dst_argb[0] = src_bayer0[0];
dst_argb[1] = AVG(g, src_bayer0[1]);
dst_argb[2] = AVG(r, src_bayer1[1]);
dst_argb[3] = 255U;
dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
dst_argb[5] = src_bayer0[1];
dst_argb[6] = src_bayer1[1];
dst_argb[7] = 255U;
g = src_bayer0[1];
r = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_argb += 8;
}
dst_argb[0] = src_bayer0[0];
dst_argb[1] = AVG(g, src_bayer0[1]);
dst_argb[2] = AVG(r, src_bayer1[1]);
dst_argb[3] = 255U;
if (!(pix & 1)) {
dst_argb[4] = src_bayer0[0];
dst_argb[5] = src_bayer0[1];
dst_argb[6] = src_bayer1[1];
dst_argb[7] = 255U;
}
}
static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1];
int x;
for (x = 0; x < pix - 2; x += 2) {
dst_argb[0] = AVG(b, src_bayer1[1]);
dst_argb[1] = AVG(g, src_bayer0[1]);
dst_argb[2] = src_bayer0[0];
dst_argb[3] = 255U;
dst_argb[4] = src_bayer1[1];
dst_argb[5] = src_bayer0[1];
dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
dst_argb[7] = 255U;
g = src_bayer0[1];
b = src_bayer1[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_argb += 8;
}
dst_argb[0] = AVG(b, src_bayer1[1]);
dst_argb[1] = AVG(g, src_bayer0[1]);
dst_argb[2] = src_bayer0[0];
dst_argb[3] = 255U;
if (!(pix & 1)) {
dst_argb[4] = src_bayer1[1];
dst_argb[5] = src_bayer0[1];
dst_argb[6] = src_bayer0[0];
dst_argb[7] = 255U;
}
}
static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1];
int x;
for (x = 0; x < pix - 2; x += 2) {
dst_argb[0] = AVG(b, src_bayer0[1]);
dst_argb[1] = src_bayer0[0];
dst_argb[2] = src_bayer1[0];
dst_argb[3] = 255U;
dst_argb[4] = src_bayer0[1];
dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
dst_argb[7] = 255U;
b = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_argb += 8;
}
dst_argb[0] = AVG(b, src_bayer0[1]);
dst_argb[1] = src_bayer0[0];
dst_argb[2] = src_bayer1[0];
dst_argb[3] = 255U;
if (!(pix & 1)) {
dst_argb[4] = src_bayer0[1];
dst_argb[5] = src_bayer0[0];
dst_argb[6] = src_bayer1[0];
dst_argb[7] = 255U;
}
}
static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_argb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1];
int x;
for (x = 0; x < pix - 2; x += 2) {
dst_argb[0] = src_bayer1[0];
dst_argb[1] = src_bayer0[0];
dst_argb[2] = AVG(r, src_bayer0[1]);
dst_argb[3] = 255U;
dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
dst_argb[6] = src_bayer0[1];
dst_argb[7] = 255U;
r = src_bayer0[1];
src_bayer0 += 2;
src_bayer1 += 2;
dst_argb += 8;
}
dst_argb[0] = src_bayer1[0];
dst_argb[1] = src_bayer0[0];
dst_argb[2] = AVG(r, src_bayer0[1]);
dst_argb[3] = 255U;
if (!(pix & 1)) {
dst_argb[4] = src_bayer1[0];
dst_argb[5] = src_bayer0[0];
dst_argb[6] = src_bayer0[1];
dst_argb[7] = 255U;
}
}
// Converts any Bayer RGB format to ARGB.
LIBYUV_API
int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int dst_stride_argb,
int width, int height,
uint32 src_fourcc_bayer) {
int y;
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int pix);
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
switch (src_fourcc_bayer) {
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
case FOURCC_GBRG:
BayerRow0 = BayerRowGB;
BayerRow1 = BayerRowRG;
break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
case FOURCC_RGGB:
BayerRow0 = BayerRowRG;
BayerRow1 = BayerRowGB;
break;
default:
return -1; // Bad FourCC
}
for (y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
dst_argb + dst_stride_argb, width);
src_bayer += src_stride_bayer * 2;
dst_argb += dst_stride_argb * 2;
}
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
}
return 0;
}
// Converts any Bayer RGB format to ARGB.
LIBYUV_API
int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height,
uint32 src_fourcc_bayer) {
void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int pix);
void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_argb, int pix);
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
// Negative height means invert the image.
if (height < 0) {
int halfheight;
height = -height;
halfheight = (height + 1) >> 1;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_u = dst_u + (halfheight - 1) * dst_stride_u;
dst_v = dst_v + (halfheight - 1) * dst_stride_v;
dst_stride_y = -dst_stride_y;
dst_stride_u = -dst_stride_u;
dst_stride_v = -dst_stride_v;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
ARGBToUVRow = ARGBToUVRow_SSSE3;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
}
#elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
switch (src_fourcc_bayer) {
case FOURCC_BGGR:
BayerRow0 = BayerRowBG;
BayerRow1 = BayerRowGR;
break;
case FOURCC_GBRG:
BayerRow0 = BayerRowGB;
BayerRow1 = BayerRowRG;
break;
case FOURCC_GRBG:
BayerRow0 = BayerRowGR;
BayerRow1 = BayerRowBG;
break;
case FOURCC_RGGB:
BayerRow0 = BayerRowRG;
BayerRow1 = BayerRowGB;
break;
default:
return -1; // Bad FourCC
}
{
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
int y;
for (y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kRowSize, width);
ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
src_bayer += src_stride_bayer * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
}
free_aligned_buffer_64(row);
}
return 0;
}
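// Usage sketch (illustrative only, not part of libyuv): convert a BGGR
// Bayer frame straight to I420. Contiguous planes are assumed; the chroma
// planes are (width + 1) / 2 by (height + 1) / 2.
static int ExampleBayerBGGRToI420(const uint8* src_bayer,
                                  uint8* dst_y, uint8* dst_u, uint8* dst_v,
                                  int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return BayerToI420(src_bayer, width,
                     dst_y, width,
                     dst_u, halfwidth,
                     dst_v, halfwidth,
                     width, height, FOURCC_BGGR);
}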
// Convert I420 to Bayer.
LIBYUV_API
int I420ToBayer(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_bayer, int dst_stride_bayer,
int width, int height,
uint32 dst_fourcc_bayer) {
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) = ARGBToBayerRow_C;
const int blue_index = 0; // Offsets for ARGB format
const int green_index = 1;
const int red_index = 2;
uint32 index_map[2];
// Negative height means invert the image.
if (height < 0) {
int halfheight;
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
}
}
#elif defined(HAS_ARGBTOBAYERROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_NEON;
}
}
#endif
if (MakeSelectors(blue_index, green_index, red_index,
dst_fourcc_bayer, index_map)) {
return -1; // Bad FourCC
}
{
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
int y;
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
dst_bayer += dst_stride_bayer;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row);
}
return 0;
}
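// Usage sketch (illustrative only, not part of libyuv): the reverse
// direction, packing I420 back into an RGGB Bayer mosaic. The same
// contiguous-plane layout as the example above is assumed.
static int ExampleI420ToBayerRGGB(const uint8* src_y, const uint8* src_u,
                                  const uint8* src_v, uint8* dst_bayer,
                                  int width, int height) {
  int halfwidth = (width + 1) >> 1;
  return I420ToBayer(src_y, width,
                     src_u, halfwidth,
                     src_v, halfwidth,
                     dst_bayer, width,
                     width, height, FOURCC_RGGB);
}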
#define MAKEBAYERFOURCC(BAYER) \
LIBYUV_API \
int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
uint8* dst_y, int dst_stride_y, \
uint8* dst_u, int dst_stride_u, \
uint8* dst_v, int dst_stride_v, \
int width, int height) { \
return BayerToI420(src_bayer, src_stride_bayer, \
dst_y, dst_stride_y, \
dst_u, dst_stride_u, \
dst_v, dst_stride_v, \
width, height, \
FOURCC_##BAYER); \
} \
\
LIBYUV_API \
int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
const uint8* src_u, int src_stride_u, \
const uint8* src_v, int src_stride_v, \
uint8* dst_bayer, int dst_stride_bayer, \
int width, int height) { \
return I420ToBayer(src_y, src_stride_y, \
src_u, src_stride_u, \
src_v, src_stride_v, \
dst_bayer, dst_stride_bayer, \
width, height, \
FOURCC_##BAYER); \
} \
\
LIBYUV_API \
int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
uint8* dst_bayer, int dst_stride_bayer, \
int width, int height) { \
return ARGBToBayer(src_argb, src_stride_argb, \
dst_bayer, dst_stride_bayer, \
width, height, \
FOURCC_##BAYER); \
} \
\
LIBYUV_API \
int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
uint8* dst_argb, int dst_stride_argb, \
int width, int height) { \
return BayerToARGB(src_bayer, src_stride_bayer, \
dst_argb, dst_stride_argb, \
width, height, \
FOURCC_##BAYER); \
}
MAKEBAYERFOURCC(BGGR)
MAKEBAYERFOURCC(GBRG)
MAKEBAYERFOURCC(GRBG)
MAKEBAYERFOURCC(RGGB)
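// For reference (illustrative note, not part of the source): each
// MAKEBAYERFOURCC(X) line above expands to four thin wrappers --
// BayerXToI420, I420ToBayerX, ARGBToBayerX and BayerXToARGB -- that
// forward to the generic functions with FOURCC_X, so callers can write
// e.g. BayerBGGRToARGB(bayer, width, argb, width * 4, width, height)
// without passing a FourCC.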
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -10,15 +10,66 @@
#include "libyuv/mjpeg_decoder.h"
#include <string.h> // For memchr.
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Enable this to try the scasb implementation.
// #define ENABLE_SCASB 1
#ifdef ENABLE_SCASB
// Multiple of 1.
__declspec(naked) __declspec(align(16))
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // src
mov eax, [esp + 8] // val
mov ecx, [esp + 12] // count
repne scasb
jne sr99
mov eax, edi
sub eax, 1
mov edi, edx
ret
sr99:
mov eax, 0
mov edi, edx
ret
}
}
#endif
// Helper function to scan for EOI marker.
static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
const uint8* end = sample + sample_size - 1;
const uint8* it = sample;
for (;;) {
#ifdef ENABLE_SCASB
it = ScanRow_ERMS(it, 0xff, end - it);
#else
it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
#endif
if (it == NULL) {
break;
}
if (it[1] == 0xd9) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
++it; // Skip over current 0xff.
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
size_t i;
const size_t kBackSearchSize = 1024;
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
@ -27,17 +78,20 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
for (i = sample_size - 2; i > 1;) {
if (sample[i] != 0xd9) {
if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
// Step over SOI marker.
sample += 2;
sample_size -= 2;
// Look for the End Of Image (EOI) marker in the last kilobyte of the buffer.
if (sample_size > kBackSearchSize) {
if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
--i;
// Reduce search size for forward search.
sample_size = sample_size - kBackSearchSize + 1;
}
--i;
}
// ERROR: Invalid jpeg end code not found. Size sample_size
return LIBYUV_FALSE;
return ScanEOI(sample, sample_size);
}
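// Usage sketch (illustrative only, not part of libyuv): reject a captured
// MJPG sample before attempting a decode. ValidateJpeg requires a minimum
// size, the ff d8 Start Of Image marker at the front, and an ff d9 End Of
// Image marker, searched first in the last kilobyte and then in the rest
// of the buffer.
static LIBYUV_BOOL ExampleIsCompleteJpeg(const uint8* sample, size_t size) {
  return ValidateJpeg(sample, size);
}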
#ifdef __cplusplus

View File

@ -41,16 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
CopyRow = CopyRow_SSE2;
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -59,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@ -90,15 +88,8 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_COPYROW_16_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_16_X86;
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
@ -239,26 +230,44 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
MirrorRow = MirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
// Mirror plane
@ -298,23 +307,17 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -324,7 +327,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
@ -376,23 +379,17 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -402,7 +399,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width >= 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
@ -497,22 +494,28 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
#endif
@ -614,7 +617,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
@ -622,7 +625,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@ -630,7 +633,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
@ -680,7 +683,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAddRow = ARGBAddRow_SSE2;
@ -688,7 +691,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
@ -696,7 +699,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBAddRow = ARGBAddRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_NEON;
@ -741,7 +744,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSUBTRACTROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBSubtractRow = ARGBSubtractRow_SSE2;
@ -749,7 +752,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
@ -757,7 +760,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_NEON;
@ -808,24 +811,31 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
}
#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#elif defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
#if defined(HAS_I422TOBGRAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToBGRARow = I422ToBGRARow_AVX2;
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
}
#endif
#if defined(HAS_I422TOBGRAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#endif
#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@ -879,21 +889,27 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
}
#if defined(HAS_I422TOABGRROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
}
}
#elif defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
#endif
#if defined(HAS_I422TOABGRROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOABGRROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToABGRRow = I422ToABGRRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
#endif
@ -941,21 +957,27 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
}
#if defined(HAS_I422TORGBAROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
}
}
#elif defined(HAS_I422TORGBAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
#endif
#if defined(HAS_I422TORGBAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
#if defined(HAS_I422TORGBAROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGBARow = I422ToRGBARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
#endif
@ -991,14 +1013,23 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV12TORGB565ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
}
}
#elif defined(HAS_NV12TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_NV12TORGB565ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
}
}
#endif
#if defined(HAS_NV12TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_NEON;
@ -1039,14 +1070,23 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV21TORGB565ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
}
}
#elif defined(HAS_NV21TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
#endif
#if defined(HAS_NV21TORGB565ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
}
}
#endif
#if defined(HAS_NV21TORGB565ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_NEON;
@ -1070,8 +1110,12 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
int y;
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (dst_stride_y == width) {
width *= height;
@ -1079,21 +1123,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasNEON)) {
SetRow = SetRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
}
#endif
#if defined(HAS_SETROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
if (TestCpuFlag(kCpuHasX86)) {
SetRow = SetRow_Any_X86;
if (IS_ALIGNED(width, 4)) {
SetRow = SetRow_X86;
}
}
#endif
#if defined(HAS_SETROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
SetRow = SetRow_ERMS;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
SetRow(dst_y, v32, width);
SetRow(dst_y, value, width);
dst_y += dst_stride_y;
}
}
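// Usage sketch (illustrative only, not part of libyuv): fill the chroma
// planes of an I420 image with 128 to strip its color while leaving the Y
// plane untouched. Plane pointers and strides are assumed to be valid.
static void ExampleGreyOutChroma(uint8* dst_u, int dst_stride_u,
                                 uint8* dst_v, int dst_stride_v,
                                 int width, int height) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128u);
  SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128u);
}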
@ -1112,7 +1165,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v ||
width <= 0 || height <= 0 ||
width <= 0 || height == 0 ||
x < 0 || y < 0 ||
value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 ||
@ -1132,11 +1185,18 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
int width, int height,
uint32 value) {
int y;
void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb ||
width <= 0 || height <= 0 ||
width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
dst_argb += dst_y * dst_stride_argb + dst_x * 4;
// Coalesce rows.
if (dst_stride_argb == width * 4) {
@ -1144,20 +1204,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1;
dst_stride_argb = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
#if defined(HAS_ARGBSETROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBSetRow = ARGBSetRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBSetRow = ARGBSetRow_NEON;
}
}
#endif
#if defined(HAS_SETROW_X86)
#if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
return 0;
ARGBSetRow = ARGBSetRow_X86;
}
#endif
ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
// Set plane
for (y = 0; y < height; ++y) {
ARGBSetRow(dst_argb, value, width);
dst_argb += dst_stride_argb;
}
return 0;
}
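// Usage sketch (illustrative only, not part of libyuv): fill a 100x50
// region at (10, 20) with a solid opaque color. The 32-bit value is
// written out as B, G, R, A bytes in memory, so 0xFFFF0000 is assumed to
// read back as opaque red.
static int ExampleFillRect(uint8* dst_argb, int dst_stride_argb) {
  return ARGBRect(dst_argb, dst_stride_argb, 10, 20, 100, 50, 0xFFFF0000u);
}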
@ -1197,9 +1263,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@ -1207,7 +1271,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@ -1215,7 +1279,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@ -1223,7 +1287,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
@ -1263,7 +1327,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
@ -1271,7 +1335,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@ -1312,12 +1376,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
#endif
#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@ -1350,11 +1413,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
#endif
#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@ -1383,11 +1446,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
#elif defined(HAS_ARGBSEPIAROW_NEON)
#endif
#if defined(HAS_ARGBSEPIAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
@ -1425,11 +1488,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
#endif
#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
@ -1568,11 +1631,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
#elif defined(HAS_ARGBQUANTIZEROW_NEON)
#endif
#if defined(HAS_ARGBQUANTIZEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
}
@ -1743,12 +1806,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#elif defined(HAS_ARGBSHADEROW_NEON)
#endif
#if defined(HAS_ARGBSHADEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBShadeRow = ARGBShadeRow_NEON;
}
@ -1790,33 +1852,23 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -1824,19 +1876,19 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@ -1876,7 +1928,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_stride_bgra = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHUFFLEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_SSE2;
@ -1884,19 +1936,15 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2;
@ -1904,7 +1952,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_NEON;
@ -1947,8 +1995,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
// ARGBToBayer is used to select the G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
@ -1956,8 +2003,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@ -1965,7 +2011,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON;
@ -2048,8 +2094,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
@ -2070,8 +2115,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
@ -2093,8 +2137,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
@ -2218,10 +2261,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
@ -2264,10 +2304,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif

View File

@ -42,11 +42,7 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
@ -55,7 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif // defined(__ARM_NEON__)
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
@ -194,31 +190,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqa xmm0, [eax]
movdqa xmm1, [eax + edi]
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqa xmm2, [eax]
movdqa xmm3, [eax + edi]
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqa xmm4, [eax]
movdqa xmm5, [eax + edi]
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqa xmm6, [eax]
movdqa xmm7, [eax + edi]
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa [esp], xmm5 // backup xmm5
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
@ -239,8 +235,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqa xmm5, [esp] // restore xmm5
movdqa [esp], xmm6 // backup xmm6
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
@ -251,7 +247,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqa xmm6, [esp] // restore xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
@ -296,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
#elif !defined(LIBYUV_DISABLE_X86) && \
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
@ -379,10 +376,8 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -411,31 +406,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqa (%eax),%xmm0 \n"
"movdqa (%eax,%edi,1),%xmm1 \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqa (%eax),%xmm2 \n"
"movdqa (%eax,%edi,1),%xmm3 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqa (%eax),%xmm4 \n"
"movdqa (%eax,%edi,1),%xmm5 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqa (%eax),%xmm6 \n"
"movdqa (%eax,%edi,1),%xmm7 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm5,(%esp) \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
@ -455,8 +450,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqa (%esp),%xmm5 \n"
"movdqa %xmm6,(%esp) \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
@ -465,7 +460,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqa (%esp),%xmm6 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
@ -514,7 +509,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"ret \n"
#endif
);
#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
@ -525,38 +521,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa (%0),%%xmm2 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqa (%0,%3),%%xmm3 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqa (%0),%%xmm4 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqa (%0,%3),%%xmm5 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqa (%0),%%xmm6 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqa (%0,%3),%%xmm7 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@ -666,29 +662,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqa (%0),%%xmm2 \n"
"movdqa (%0,%4),%%xmm3 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqa (%0),%%xmm4 \n"
"movdqa (%0,%4),%%xmm5 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqa (%0),%%xmm6 \n"
"movdqa (%0,%4),%%xmm7 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@ -818,9 +814,7 @@ void TransposePlane(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
@ -883,29 +877,38 @@ void RotatePlane180(const uint8* src, int src_stride,
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
MirrorRow = MirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSE2;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
// TODO(fbarchard): Mirror on mips handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@ -913,21 +916,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -935,6 +931,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@ -1010,13 +1011,13 @@ void TransposeUV(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 8) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
#endif
#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@ -1084,12 +1085,13 @@ void RotateUV180(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorRowUV = MirrorUVRow_NEON;
}
#elif defined(HAS_MIRRORROW_UV_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_MIRRORROW_UV_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRowUV = MirrorUVRow_SSSE3;
}
#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
#endif
#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorUVRow_MIPS_DSPR2;

View File

@ -31,7 +31,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx,
@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
IS_ALIGNED(src, 4)) {
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@ -102,38 +101,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_AVX2;
}
#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
CopyRow = CopyRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
#if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@ -141,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;

View File

@ -17,7 +17,8 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@ -525,7 +526,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
#endif
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"

View File

@ -0,0 +1,543 @@
/*
* Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// Loops are done on blocks of 8. The loop will stop when the
// counter gets to or below 0. Starting the counter at w-8
// allows for this.
"sub %3, %3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"trn1 v17.8b, v0.8b, v1.8b \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"trn1 v19.8b, v2.8b, v3.8b \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"trn1 v21.8b, v4.8b, v5.8b \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"trn1 v23.8b, v6.8b, v7.8b \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"trn1 v1.4h, v17.4h, v19.4h \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"trn1 v0.4h, v16.4h, v18.4h \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"trn1 v5.4h, v21.4h, v23.4h \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
"trn1 v17.2s, v1.2s, v5.2s \n"
"trn2 v20.2s, v0.2s, v4.2s \n"
"trn1 v16.2s, v0.2s, v4.2s \n"
"trn2 v23.2s, v3.2s, v7.2s \n"
"trn1 v19.2s, v3.2s, v7.2s \n"
"trn2 v22.2s, v2.2s, v6.2s \n"
"trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
"subs %3, %3, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %3, %3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %3, #2 \n"
"b.lt 3f \n"
"cmp %3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
MEMACCESS(4)
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
MEMACCESS(0)
"st1 {v3.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v0.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %3, %3, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %3, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v0.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v3.8b}, [%0], %6 \n"
MEMACCESS(0)
"st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %3, %3, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld1 {v0.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld1 {v0.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width64) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
}
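// For reference, a minimal C sketch of what the NEON routine above computes:
// column i of the 8-row source strip becomes 8-byte row i of the destination.
// The function name is illustrative only and not part of the library.
static void TransposeWx8_Sketch(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {    // source column i -> destination row i
    for (j = 0; j < 8; ++j) {      // 8 rows in the strip
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}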
static uint8 kVTbl4x4TransposeDi[32] =
{ 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %4, %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.16b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v17.d}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v20.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v20.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v22.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v21.d}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %4, %4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, %4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2 \n"
"b.lt 3f \n"
"cmp %4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
MEMACCESS(8)
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v16.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n"
MEMACCESS(0)
"st1 {v18.s}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[1], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[2], [%0], %6 \n"
MEMACCESS(0)
"st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v17.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n"
MEMACCESS(0)
"st1 {v19.s}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[1], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[2], [%0], %7 \n"
MEMACCESS(0)
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %4, %4, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %4, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
MEMACCESS(0)
"ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"mov %0, %2 \n"
MEMACCESS(0)
"st1 {v4.d}[0], [%0], %6 \n"
MEMACCESS(0)
"st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n"
MEMACCESS(0)
"st1 {v5.d}[0], [%0], %7 \n"
MEMACCESS(0)
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %4, %4, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
MEMACCESS(1)
"ld2 {v0.b, v1.b}[7], [%1] \n"
MEMACCESS(2)
"st1 {v0.d}[0], [%2] \n"
MEMACCESS(3)
"st1 {v1.d}[0], [%3] \n"
"4: \n"
: "+r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width64) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v30", "v31"
);
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -17,17 +17,14 @@ namespace libyuv {
extern "C" {
#endif
// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
// TODO(fbarchard): Consider 'any' functions handling odd alignment.
// YUV to RGB does multiple of 8 with SIMD and remainder with C.
#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, \
const uint8* u_buf, \
const uint8* v_buf, \
uint8* rgb_buf, \
int width) { \
void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
uint8* rgb_buf, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
} \
I420TORGB_C(y_buf + n, \
u_buf + (n >> UV_SHIFT), \
v_buf + (n >> UV_SHIFT), \
@ -35,36 +32,59 @@ extern "C" {
}
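// For illustration: with the parameters used for I422ToARGBRow below
// (UV_SHIFT = 1, BPP = 4, MASK = 7), YANY expands to roughly the following.
// The SIMD body covers the largest multiple of 8 pixels and the C row
// function finishes the remainder.
//
// void I422ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf,
//                              const uint8* v_buf, uint8* rgb_buf, int width) {
//   int n = width & ~7;
//   if (n > 0) {
//     I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, rgb_buf, n);
//   }
//   I422ToARGBRow_C(y_buf + n, u_buf + (n >> 1), v_buf + (n >> 1),
//                   rgb_buf + n * 4, width & 7);
// }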
#ifdef HAS_I422TOARGBROW_SSSE3
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7)
#endif // HAS_I422TOARGBROW_SSSE3
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C,
2, 4, 7)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C,
1, 4, 7)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C,
1, 4, 7)
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C,
1, 4, 7)
// I422ToRGB565Row_SSSE3 is unaligned.
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
1, 2, 7)
YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
1, 2, 7)
// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_J422TOARGBROW_SSSE3
YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
1, 4, 7)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
#endif
#ifdef HAS_I422TOBGRAROW_AVX2
YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15)
#endif
#ifdef HAS_I422TOABGRROW_AVX2
YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C,
1, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C,
1, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C,
1, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_NEON
YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
@ -79,214 +99,240 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
#endif // HAS_I422TOARGBROW_NEON
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
#endif // HAS_I422TOYUY2ROW_NEON
#endif
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOUYVYROW_NEON
#endif
#undef YANY
// Wrappers to handle odd width
#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
void NAMEANY(const uint8* y_buf, \
const uint8* uv_buf, \
uint8* rgb_buf, \
int width) { \
int n = width & ~7; \
#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
uint8* rgb_buf, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
} \
NV12TORGB_C(y_buf + n, \
uv_buf + (n >> UV_SHIFT), \
rgb_buf + n * BPP, width & 7); \
rgb_buf + n * BPP, width & MASK); \
}
#ifdef HAS_NV12TOARGBROW_SSSE3
NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
0, 4)
NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
0, 4)
#endif // HAS_NV12TOARGBROW_SSSE3
NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7)
#endif
#ifdef HAS_NV12TOARGBROW_AVX2
NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15)
NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15)
#endif
#ifdef HAS_NV12TOARGBROW_NEON
NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
#endif // HAS_NV12TOARGBROW_NEON
NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7)
NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
0, 2)
0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
0, 2)
#endif // HAS_NV12TORGB565ROW_SSSE3
0, 2, 7)
#endif
#ifdef HAS_NV12TORGB565ROW_AVX2
NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C,
0, 2, 15)
NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C,
0, 2, 15)
#endif
#ifdef HAS_NV12TORGB565ROW_NEON
NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
#endif // HAS_NV12TORGB565ROW_NEON
NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C,
0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C,
0, 2, 7)
#endif
#undef NV2NY
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* src, \
uint8* dst, \
int width) { \
#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src, uint8* dst, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ARGBTORGB_SIMD(src, dst, n); \
} \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
15, 4, 3)
4, 3, 15)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
15, 4, 3)
4, 3, 15)
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
3, 4, 2)
4, 2, 3)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
3, 4, 2)
4, 2, 3)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
4, 2, 3)
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C,
4, 2, 7)
RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C,
4, 2, 7)
RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
4, 2, 7)
#endif
#if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
15, 2, 4)
RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
15, 2, 4)
// These require alignment on ARGB, so C is used for remainder.
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
#endif
#if defined(HAS_YTOARGBROW_AVX2)
RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 1, 4, 15)
#endif
#if defined(HAS_YUY2TOARGBROW_SSSE3)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 2, 4, 15)
RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 2, 4, 15)
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
15, 3, 4)
RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
15, 3, 4)
3, 4, 15)
RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, 3, 4, 15)
RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
7, 2, 4)
2, 4, 7)
RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
7, 2, 4)
2, 4, 7)
RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
7, 2, 4)
2, 4, 7)
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 4, 3, 7)
RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 4, 3, 7)
RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
7, 4, 2)
4, 2, 7)
RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
7, 4, 2)
4, 2, 7)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
7, 4, 2)
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
7, 1, 4)
RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
7, 2, 4)
RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
7, 2, 4)
4, 2, 7)
RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 1, 4, 7)
RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, 1, 4, 7)
RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, 2, 4, 7)
RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7)
#endif
#undef RGBANY
// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
void NAMEANY(const uint8* src, \
uint8* dst, uint32 selector, \
int width) { \
#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ARGBTORGB_SIMD(src, dst, selector, n); \
} \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
}
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
7, 4, 1)
#endif
#if defined(HAS_ARGBTOBAYERROW_NEON)
BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
7, 4, 1)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
7, 4, 1)
4, 1, 7)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
7, 4, 1)
4, 1, 7)
#endif
#undef BAYERANY
// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
dst_y + (width - NUM) * BPP, NUM); \
}
#ifdef HAS_ARGBTOYROW_AVX2
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif
#undef YANY
#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ARGBTOY_SIMD(src_argb, dst_y, n); \
} \
ARGBTOY_C(src_argb + n * SBPP, \
dst_y + n * BPP, width & MASK); \
}
// Attenuate is destructive so the last16 method cannot be used due to overlap.
#ifdef HAS_ARGBTOYROW_AVX2
YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYJROW_AVX2
YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_AVX2
YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31)
#endif
#ifdef HAS_YUY2TOYROW_AVX2
YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, BGRAToYRow_C, 4, 1, 15)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, ABGRToYRow_C, 4, 1, 15)
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, RGBAToYRow_C, 4, 1, 15)
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, YUY2ToYRow_C, 2, 1, 15)
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, ARGBToYJRow_C, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, ARGBToYRow_C, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, ARGBToYJRow_C, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_NEON
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, BGRAToYRow_C, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_NEON
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, ABGRToYRow_C, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_NEON
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, RGBAToYRow_C, 4, 1, 7)
#endif
#ifdef HAS_RGB24TOYROW_NEON
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, RGB24ToYRow_C, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYROW_NEON
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, RAWToYRow_C, 3, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_NEON
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, RGB565ToYRow_C, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, ARGB1555ToYRow_C, 2, 1, 7)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, ARGB4444ToYRow_C, 2, 1, 7)
#endif
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, YUY2ToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, RGB24ToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, RAWToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, RGB565ToARGBRow_C, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, ARGB1555ToARGBRow_C,
2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, ARGB4444ToARGBRow_C,
2, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
4, 4, 3)
@ -318,7 +364,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
} \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
@ -327,29 +375,50 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15)
UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15)
#endif
#ifdef HAS_YUY2TOUVROW_AVX2
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
4, 15)
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#ifdef HAS_YUY2TOUVROW_SSE2
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15)
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
@ -360,11 +429,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
void NAMEANY(const uint8* src_uv, \
uint8* dst_u, uint8* dst_v, int width) { \
#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, SHIFT, MASK) \
void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
} \
ANYTOUV_C(src_uv + n * BPP, \
dst_u + (n >> SHIFT), \
dst_v + (n >> SHIFT), \
@ -372,42 +442,45 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3
UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
ARGBToUV444Row_C, 4, 15, 0)
UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3,
ARGBToUV444Row_C, 4, 0, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
YUY2ToUV422Row_C, 2, 31, 1)
YUY2ToUV422Row_C, 2, 1, 31)
UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
UYVYToUV422Row_C, 2, 31, 1)
UYVYToUV422Row_C, 2, 1, 31)
#endif
#ifdef HAS_ARGBTOUVROW_SSSE3
UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
ARGBToUV422Row_C, 4, 15, 1)
UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
YUY2ToUV422Row_C, 2, 15, 1)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
UYVYToUV422Row_C, 2, 15, 1)
#ifdef HAS_ARGBTOUV422ROW_SSSE3
UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3,
ARGBToUV422Row_C, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2
UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2,
YUY2ToUV422Row_C, 2, 1, 15)
UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2,
UYVYToUV422Row_C, 2, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
ARGBToUV444Row_C, 4, 7, 0)
ARGBToUV444Row_C, 4, 0, 7)
UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
ARGBToUV422Row_C, 4, 15, 1)
ARGBToUV422Row_C, 4, 1, 15)
UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
ARGBToUV411Row_C, 4, 31, 2)
ARGBToUV411Row_C, 4, 2, 31)
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
YUY2ToUV422Row_C, 2, 15, 1)
YUY2ToUV422Row_C, 2, 1, 15)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
UYVYToUV422Row_C, 2, 15, 1)
UYVYToUV422Row_C, 2, 1, 15)
#endif
#undef UV422ANY
#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
void NAMEANY(const uint8* src_uv, \
uint8* dst_u, uint8* dst_v, int width) { \
void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
} \
ANYTOUV_C(src_uv + n * 2, \
dst_u + n, \
dst_v + n, \
@ -415,7 +488,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
}
#ifdef HAS_SPLITUVROW_SSE2
SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
@ -424,7 +497,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2,
SplitUVRow_C, 15)
#endif
#undef SPLITUVROWANY
@ -433,7 +506,9 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
void NAMEANY(const uint8* src_u, const uint8* src_v, \
uint8* dst_uv, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
} \
ANYTOUV_C(src_u + n, \
src_v + n, \
dst_uv + n * 2, \
@ -441,7 +516,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
}
#ifdef HAS_MERGEUVROW_SSE2
MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
@ -455,7 +530,9 @@ MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
} \
ARGBMATH_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
@ -502,7 +579,9 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
const uint8* shuffler, int width) { \
int n = width & ~MASK; \
if (n > 0) { \
ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
} \
ARGBTOY_C(src_argb + n * SBPP, \
dst_argb + n * BPP, shuffler, width & MASK); \
}
@ -512,7 +591,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
@ -531,35 +610,107 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
int n = width & ~MASK; \
TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
n, source_y_fraction); \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
} \
TERP_C(dst_ptr + n * BPP, \
src_ptr + n * SBPP, src_stride_ptr, \
width & MASK, source_y_fraction); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
InterpolateRow_C, 1, 1, 32)
NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
InterpolateRow_C, 1, 1, 15)
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
InterpolateRow_C, 1, 1, 15)
NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
InterpolateRow_C, 1, 1, 15)
NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
InterpolateRow_C, 1, 1, 3)
NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
1, 1, 3)
#endif
#undef NANY
#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
MIRROR_SIMD(src_y, dst_y + r * BPP, n); \
} \
MIRROR_C(src_y + n * BPP, dst_y, r); \
}
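// Worked example of the offsets above: for width = 21 and MASK = 15, n = 16
// and r = 5. The SIMD call mirrors src[0..15] into dst[5..20] (hence
// dst_y + r * BPP), and the C call mirrors the remaining src[16..20] into
// dst[0..4].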
#ifdef HAS_MIRRORROW_AVX2
MANY(MirrorRow_Any_AVX2, MirrorRow_AVX2, MirrorRow_C, 1, 31)
#endif
#ifdef HAS_MIRRORROW_SSSE3
MANY(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, MirrorRow_C, 1, 15)
#endif
#ifdef HAS_MIRRORROW_SSE2
MANY(MirrorRow_Any_SSE2, MirrorRow_SSE2, MirrorRow_C, 1, 15)
#endif
#ifdef HAS_MIRRORROW_NEON
MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15)
#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_SSE2
MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
#endif
#undef MANY
#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
COPY_SIMD(src_y, dst_y, n); \
} \
COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
}
#ifdef HAS_COPYROW_AVX
MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
#endif
#ifdef HAS_COPYROW_SSE2
MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
#endif
#ifdef HAS_COPYROW_NEON
MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif
#undef MANY
#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \
void NAMEANY(uint8* dst_y, T v8, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
SET_SIMD(dst_y, v8, n); \
} \
SET_C(dst_y + n * BPP, v8, r); \
}
#ifdef HAS_SETROW_X86
SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
#endif
#undef SETANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
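// Ordered-dither variant of ARGBToRGB565: the per-column dither value
// (dither8x8[x & 7], biased by -128) is added to B, G and R before each
// channel is truncated to 5/6/5 bits.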
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = dither8x8[x & 7] - 128;
int dither1 = dither8x8[(x & 7) + 1] - 128;
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
int dither0 = dither8x8[(width - 1) & 7] - 128;
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
*(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
}
}
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@ -385,6 +411,28 @@ void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
void ARGBToUVJ422Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
dst_u[0] = RGBToUJ(ar, ag, ab);
dst_v[0] = RGBToVJ(ar, ag, ab);
src_argb += 8;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8 ab = src_argb[0];
uint8 ag = src_argb[1];
uint8 ar = src_argb[2];
dst_u[0] = RGBToUJ(ar, ag, ab);
dst_v[0] = RGBToVJ(ar, ag, ab);
}
}
void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
@ -938,33 +986,52 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
}
}
// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
// C reference code that mimics the YUV assembly.
#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
#define UB 127 /* min(63,(int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0
#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
int32 y1 = ((int32)(y) - 16) * YG;
*b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
*g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
*r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
*g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
*r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
}
// C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(y1 - YGB) >> 6);
*g = Clamp((int32)(y1 - YGB) >> 6);
*r = Clamp((int32)(y1 - YGB) >> 6);
}
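// Quick numeric check of the constants above (illustrative), using the
// updated form y1 = (y * 0x0101 * YG) >> 16 and BB = UB * 128 - YGB = -17544:
//   Y = 16,  U = V = 128: y1 = 1191,  B = (BB + 16384 + 1191)  >> 6 = 0 (black)
//   Y = 235, U = V = 128: y1 = 17506, B = (BB + 16384 + 17506) >> 6 = 255 (white)
// G and R reduce to the same values for neutral chroma.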
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimics assembly.
// TODO(fbarchard): Remove subsampling from Neon.
void I444ToARGBRow_C(const uint8* src_y,
@ -1008,6 +1075,7 @@ void I444ToARGBRow_C(const uint8* src_y,
}
}
#endif
// Also used for 420
void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
@ -1034,6 +1102,59 @@ void I422ToARGBRow_C(const uint8* src_y,
}
}
// C reference code that mimics the YUV assembly.
// * R = Y + 1.40200 * Cr
// * G = Y - 0.34414 * Cb - 0.71414 * Cr
// * B = Y + 1.77200 * Cb
#define YGJ 64 /* (int8)round(1.000 * 64) */
#define UBJ 113 /* (int8)round(1.772 * 64) */
#define UGJ -22 /* (int8)round(-0.34414 * 64) */
#define URJ 0
#define VBJ 0
#define VGJ -46 /* (int8)round(-0.71414 * 64) */
#define VRJ 90 /* (int8)round(1.402 * 64) */
// Bias
#define BBJ (UBJ * 128 + VBJ * 128)
#define BGJ (UGJ * 128 + VGJ * 128)
#define BRJ (URJ * 128 + VRJ * 128)
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * YGJ);
*b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
*g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
*r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
}
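// Illustrative check: with neutral chroma (U = V = 128) the J path is an
// identity on luma, since y1 = y * YGJ = y * 64, u * UBJ = 128 * 113 = BBJ
// and u * UGJ + v * VGJ = -8704 = BGJ, so each channel reduces to
// (y * 64) >> 6 = y, as expected for full-range JPEG YUV.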
void J422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvJPixel(src_y[0], src_u[0], src_v[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvJPixel(src_y[1], src_u[0], src_v[0],
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
src_u += 1;
src_v += 1;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvJPixel(src_y[0], src_u[0], src_v[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@ -1470,18 +1591,15 @@ void I422ToRGBARow_C(const uint8* src_y,
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], 128, 128,
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], 128, 128,
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(src_y[0], 128, 128,
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@ -1569,29 +1687,16 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2);
}
void SetRow_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
// VC will generate rep stosb.
int x;
for (x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
void SetRow_C(uint8* dst, uint8 v8, int width) {
memset(dst, v8, width);
}
void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
int y;
for (y = 0; y < height; ++y) {
uint32* d = (uint32*)(dst);
void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
uint32* d = (uint32*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
@ -1885,8 +1990,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
}
// Blend 2 rows into 1 for conversions such as I422ToI420.
void HalfRow_C(const uint8* src_uv, int src_uv_stride,
// Blend 2 rows into 1.
static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
@ -1894,7 +1999,7 @@ void HalfRow_C(const uint8* src_uv, int src_uv_stride,
}
}
void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
uint16* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
@ -1957,24 +2062,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
void ARGBToBayerRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
int index0 = selector & 0xff;
int index1 = (selector >> 8) & 0xff;
// Copy a row of Bayer.
int x;
for (x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[index0];
dst_bayer[1] = src_argb[index1];
src_argb += 8;
dst_bayer += 2;
}
if (pix & 1) {
dst_bayer[0] = src_argb[index0];
}
}
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
@ -2061,122 +2148,272 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}
#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
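// The wrappers below run through a fixed SIMD_ALIGNED stack buffer in chunks
// of at most MAXTWIDTH pixels rather than heap-allocating a full-width row
// per call, so temporary storage stays bounded for any image width.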
#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
uint8* dst_rgb565,
int width) {
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
ARGBToRGB565Row_SSE2(row, rgb_buf, width);
free_aligned_buffer_64(row);
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
#endif
#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
uint8* dst_argb1555,
int width) {
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
free_aligned_buffer_64(row);
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_argb1555 += twidth * 2;
width -= twidth;
}
}
#endif
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* rgb_buf,
uint8* dst_argb4444,
int width) {
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
free_aligned_buffer_64(row);
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_argb4444 += twidth * 2;
width -= twidth;
}
}
#endif
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_uv,
#if defined(HAS_NV12TORGB565ROW_SSSE3)
void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
src_y += twidth;
src_uv += twidth;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif
#if defined(HAS_NV21TORGB565ROW_SSSE3)
void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
src_y += twidth;
src_vu += twidth;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif
#if defined(HAS_YUY2TOARGBROW_SSSE3)
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
// Row buffers for intermediate YUV pixels.
SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
src_yuy2 += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
}
#endif
#if defined(HAS_UYVYTOARGBROW_SSSE3)
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
// Row buffers for intermediate YUV pixels.
SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
src_uyvy += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
}
#endif // !defined(LIBYUV_DISABLE_X86)
#if defined(HAS_I422TORGB565ROW_AVX2)
void I422ToRGB565Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
int width) {
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
free_aligned_buffer_64(row);
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif
void NV21ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_rgb565,
#if defined(HAS_I422TOARGB1555ROW_AVX2)
void I422ToARGB1555Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
int width) {
// Allocate a row of ARGB.
align_buffer_64(row, width * 4);
NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
free_aligned_buffer_64(row);
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_argb1555 += twidth * 2;
width -= twidth;
}
}
#endif
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
#if defined(HAS_I422TOARGB4444ROW_AVX2)
void I422ToARGB4444Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
int width) {
// Allocate rows of yuv.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8* row_u = row_y + ((width + 63) & ~63);
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
YUY2ToYRow_SSE2(src_yuy2, row_y, width);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
free_aligned_buffer_64(row_y);
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_argb4444 += twidth * 2;
width -= twidth;
}
}
#endif
void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
// Allocate rows of yuv.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8* row_u = row_y + ((width + 63) & ~63);
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
free_aligned_buffer_64(row_y);
#if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
src_y += twidth;
src_uv += twidth;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
// Allocate rows of yuv.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8* row_u = row_y + ((width + 63) & ~63);
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
UYVYToYRow_SSE2(src_uyvy, row_y, width);
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
free_aligned_buffer_64(row_y);
#if defined(HAS_NV21TORGB565ROW_AVX2)
void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
uint8* dst_rgb565, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
src_y += twidth;
src_vu += twidth;
dst_rgb565 += twidth * 2;
width -= twidth;
}
}
#endif
void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
// Allocate rows of yuv.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
uint8* row_u = row_y + ((width + 63) & ~63);
uint8* row_v = row_u + ((width + 63) & ~63) / 2;
UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
free_aligned_buffer_64(row_y);
#if defined(HAS_YUY2TOARGBROW_AVX2)
void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
// Row buffers for intermediate YUV pixels.
SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
src_yuy2 += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
}
#endif
#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#if defined(HAS_UYVYTOARGBROW_AVX2)
void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
// Row buffers for intermediate YUV pixels.
SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
src_uyvy += twidth * 2;
dst_argb += twidth * 4;
width -= twidth;
}
}
#endif // !defined(LIBYUV_DISABLE_X86)
void ARGBPolynomialRow_C(const uint8* src_argb,

View File

@ -378,7 +378,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
(_MIPS_SIM == _MIPS_SIM_ABI32)
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
uint8* dst_v, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"srl $t4, %[width], 4 \n" // multiplies of 16
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lwr $t0, 0(%[src_uv]) \n"
"lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
"lwr $t1, 4(%[src_uv]) \n"
"lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
"lwr $t2, 8(%[src_uv]) \n"
"lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
"lwr $t3, 12(%[src_uv]) \n"
"lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
"lwr $t5, 16(%[src_uv]) \n"
"lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
"lwr $t6, 20(%[src_uv]) \n"
"lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
"lwr $t7, 24(%[src_uv]) \n"
"lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
"lwr $t8, 28(%[src_uv]) \n"
"lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
"precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
"precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
"precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
"precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
"precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
"precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
"precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
"precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
"addiu %[src_uv], %[src_uv], 32 \n"
"swr $t9, 0(%[dst_v]) \n"
"swl $t9, 3(%[dst_v]) \n"
"swr $t0, 0(%[dst_u]) \n"
"swl $t0, 3(%[dst_u]) \n"
"swr $t1, 4(%[dst_v]) \n"
"swl $t1, 7(%[dst_v]) \n"
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n"
"swr $t3, 8(%[dst_v]) \n"
"swl $t3, 11(%[dst_v]) \n"
"swr $t5, 8(%[dst_u]) \n"
"swl $t5, 11(%[dst_u]) \n"
"swr $t6, 12(%[dst_v]) \n"
"swl $t6, 15(%[dst_v]) \n"
"swr $t7, 12(%[dst_u]) \n"
"swl $t7, 15(%[dst_u]) \n"
"addiu %[dst_u], %[dst_u], 16 \n"
"bgtz $t4, 1b \n"
" addiu %[dst_v], %[dst_v], 16 \n"
"beqz %[width], 3f \n"
" nop \n"
"2: \n"
"lbu $t0, 0(%[src_uv]) \n"
"lbu $t1, 1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], 2 \n"
"addiu %[width], %[width], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[width], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[width] "+r" (width),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v)
:
: "t0", "t1", "t2", "t3",
"t4", "t5", "t6", "t7", "t8", "t9"
);
}
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
@ -927,7 +844,7 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
}
// Bilinear filter 8x2 -> 8x1
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
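
For reference, a hedged C sketch of the bilinear row filter this function implements (the name and exact rounding here are assumptions, not the library's C fallback):

  #include <stddef.h>
  #include <stdint.h>

  // Blend two source rows into one destination row. source_y_fraction/256 is
  // the weight of the second row, y0_fraction/256 the weight of the first.
  void InterpolateRowSketch_C(uint8_t* dst_ptr, const uint8_t* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
    int y1_fraction = source_y_fraction;
    int y0_fraction = 256 - y1_fraction;
    const uint8_t* src_ptr1 = src_ptr + src_stride;
    for (int x = 0; x < dst_width; ++x) {
      dst_ptr[x] = (uint8_t)((src_ptr[x] * y0_fraction +
                              src_ptr1[x] * y1_fraction) >> 8);
    }
  }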

View File

@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
@ -92,36 +93,73 @@ extern "C" {
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
#define YUV422TORGB \
"veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
"vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
"vmull.s8 q9, d2, d25 \n"/* u/v G component */\
"vmov.u8 d1, #0 \n"/* split odd/even y apart */\
"vtrn.u8 d0, d1 \n" \
"vsub.s16 q0, q0, q15 \n"/* offset y */\
"vmul.s16 q0, q0, q14 \n" \
"vadd.s16 d18, d19 \n" \
"vqadd.s16 d20, d0, d16 \n" /* B */ \
"vqadd.s16 d21, d1, d16 \n" \
"vqadd.s16 d22, d0, d17 \n" /* R */ \
"vqadd.s16 d23, d1, d17 \n" \
"vqadd.s16 d16, d0, d18 \n" /* G */ \
"vqadd.s16 d17, d1, d18 \n" \
"vqshrun.s16 d0, q10, #6 \n" /* B */ \
"vqshrun.s16 d1, q11, #6 \n" /* G */ \
"vqshrun.s16 d2, q8, #6 \n" /* R */ \
"vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
"vmovl.u8 q11, d1 \n" \
"vmovl.u8 q8, d2 \n" \
"vtrn.u8 d20, d21 \n" \
"vtrn.u8 d22, d23 \n" \
"vtrn.u8 d16, d17 \n" \
"vmov.u8 d21, d16 \n"
#define YUV422TORGB_SETUP_REG \
"vld1.8 {d24}, [%[kUVToRB]] \n" \
"vld1.8 {d25}, [%[kUVToG]] \n" \
"vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
"vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
"vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
#define YUV422TORGB \
"vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
"vmull.u8 q9, d2, d25 \n" /* u/v G component */\
"vmovl.u8 q0, d0 \n" /* Y */\
"vmovl.s16 q10, d1 \n" \
"vmovl.s16 q0, d0 \n" \
"vmul.s32 q10, q10, q15 \n" \
"vmul.s32 q0, q0, q15 \n" \
"vqshrun.s32 d0, q0, #16 \n" \
"vqshrun.s32 d1, q10, #16 \n" /* Y */\
"vadd.s16 d18, d19 \n" \
"vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
"vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
"vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
"vaddw.u16 q1, q1, d16 \n" \
"vaddw.u16 q10, q10, d17 \n" \
"vaddw.u16 q3, q3, d18 \n" \
"vqadd.s16 q8, q0, q13 \n" /* B */ \
"vqadd.s16 q9, q0, q14 \n" /* R */ \
"vqadd.s16 q0, q0, q4 \n" /* G */ \
"vqadd.s16 q8, q8, q1 \n" /* B */ \
"vqadd.s16 q9, q9, q10 \n" /* R */ \
"vqsub.s16 q0, q0, q3 \n" /* G */ \
"vqshrun.s16 d20, q8, #6 \n" /* B */ \
"vqshrun.s16 d22, q9, #6 \n" /* R */ \
"vqshrun.s16 d21, q0, #6 \n" /* G */
// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102,
0, 0, 0, 0, 0, 0, 0, 0 };
static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
0, 0, 0, 0, 0, 0, 0, 0 };
static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
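
To make the fixed-point constants above concrete, here is an illustrative scalar sketch of one BT.601 YUV-to-RGB pixel using the same YG/YGB/UB/UG/VG/VR values. It is a hedged reconstruction, not the library's exact per-pixel routine; clamp8() and the function name are local assumptions.

  #include <stdint.h>

  // Saturate an intermediate result to [0, 255]. Local helper, not a libyuv API.
  static inline uint8_t clamp8(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  // The negated U/V coefficients and the BB/BG/BR biases fold the (Y - 16) and
  // (U/V - 128) offsets into one multiply-add per channel, in 6-bit fixed point.
  static inline void yuv_to_rgb_pixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                                             uint8_t* b, uint8_t* g, uint8_t* r) {
    int32_t y1 = (int32_t)(((uint32_t)y * 0x0101 * 18997) >> 16);  /* ~1.164*64*Y */
    int32_t bb = -128 * 128 - 1160;           /* BB = UB*128 - YGB            */
    int32_t bg = 25 * 128 + 52 * 128 - 1160;  /* BG = UG*128 + VG*128 - YGB   */
    int32_t br = -102 * 128 - 1160;           /* BR = VR*128 - YGB            */
    *b = clamp8((y1 + 128 * u + bb) >> 6);          /* ~1.164(Y-16) + 2.0(U-128)  */
    *g = clamp8((y1 - 25 * u - 52 * v + bg) >> 6);  /* ~1.164(Y-16) - .391(U-128) - .813(V-128) */
    *r = clamp8((y1 + 102 * v + br) >> 6);          /* ~1.164(Y-16) + 1.596(V-128) */
  }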
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@ -129,13 +167,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV444
@ -150,8 +182,10 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -163,13 +197,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -184,8 +212,10 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -197,13 +227,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV411
@ -218,8 +242,10 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -231,13 +257,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -253,8 +273,10 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -266,13 +288,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -288,8 +304,10 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -301,13 +319,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -322,8 +334,10 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -335,13 +349,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -355,8 +363,10 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -368,13 +378,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -389,8 +393,10 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_raw), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -414,13 +420,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -435,8 +435,10 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -463,13 +465,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@ -485,8 +481,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -507,13 +505,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
@ -530,8 +522,10 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: [kUVToRB]"r"(&kUVToRB), // %5
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -541,13 +535,7 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV400
@ -560,8 +548,10 @@ void YToARGBRow_NEON(const uint8* src_y,
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kUVToRB), // %3
"r"(&kUVToG) // %4
: [kUVToRB]"r"(&kUVToRB), // %3
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -595,13 +585,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@ -615,8 +599,10 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: [kUVToRB]"r"(&kUVToRB), // %4
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -627,13 +613,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@ -647,8 +627,10 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: [kUVToRB]"r"(&kUVToRB), // %4
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -659,13 +641,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@ -679,8 +655,10 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: [kUVToRB]"r"(&kUVToRB), // %4
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -691,13 +669,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@ -711,8 +683,10 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: [kUVToRB]"r"(&kUVToRB), // %4
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -722,13 +696,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUY2
@ -741,8 +709,10 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kUVToRB), // %3
"r"(&kUVToG) // %4
: [kUVToRB]"r"(&kUVToRB), // %3
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -752,13 +722,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READUYVY
@ -771,8 +735,10 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(&kUVToRB), // %3
"r"(&kUVToG) // %4
: [kUVToRB]"r"(&kUVToRB), // %3
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@ -844,12 +810,28 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
);
}
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v8) // %2
: "cc", "memory", "q0"
);
}
// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.

void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"subs %1, %1, #4 \n" // 4 pixels per loop
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
@ -860,16 +842,6 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
);
}
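
In scalar terms, this change splits one 32-bit fill helper into a byte fill (SetRow) and a per-pixel 32-bit fill (ARGBSetRow). A minimal C sketch of the intended semantics, with assumed names:

  #include <stdint.h>
  #include <string.h>

  // SetRow-style fill: 'count' is in bytes.
  void set_row_sketch(uint8_t* dst, uint8_t v8, int count) {
    memset(dst, v8, count);
  }

  // ARGBSetRow-style fill: 'count' is in pixels, 4 bytes each, repeating the
  // 32-bit value in host byte order.
  void argb_set_row_sketch(uint8_t* dst, uint32_t v32, int count) {
    for (int x = 0; x < count; ++x) {
      memcpy(dst + 4 * x, &v32, 4);
    }
  }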
// TODO(fbarchard): Make fully assembler
// SetRow32 writes 'count' words using a 32 bit value repeated.
void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
SetRow_NEON(dst, v32, width << 2);
dst += dst_stride;
}
}
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// Start at end of source row.
@ -1273,53 +1245,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
);
}
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2
"+r"(pix) // %3
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
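
The removed HalfRow helper averages a row with the row below it (vrhadd.u8 is a rounding halving add). A rough C equivalent, names assumed:

  #include <stdint.h>

  // Average each byte of a row with the byte directly below it, rounding up.
  void half_row_sketch(const uint8_t* src_uv, int src_uv_stride,
                       uint8_t* dst_uv, int pix) {
    for (int x = 0; x < pix; ++x) {
      dst_uv[x] = (uint8_t)((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1);
    }
  }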
// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels
MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
: "r"(selector) // %3
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
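
A hedged sketch of the "select 2 channels on alternating pixels" idea behind the removed Bayer row function: take one channel from even pixels and another from odd pixels, e.g. B then G to produce BGBG... The real function encodes the two byte offsets in 'selector'; the fixed offsets below are an assumption for a BG row (libyuv ARGB is stored B,G,R,A in memory).

  #include <stdint.h>

  // Emit one Bayer BG pair per two ARGB pixels: B from the even pixel,
  // G from the odd pixel.
  void argb_to_bayer_bg_sketch(const uint8_t* src_argb, uint8_t* dst_bayer,
                               int pix) {
    for (int x = 0; x + 1 < pix; x += 2) {
      dst_bayer[0] = src_argb[0];  /* B of even pixel (byte offset 0) */
      dst_bayer[1] = src_argb[5];  /* G of odd pixel (byte offset 4 + 1) */
      src_argb += 8;
      dst_bayer += 2;
    }
  }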
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
@ -2832,7 +2757,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g
"vmovl.u8 q10, d20 \n" // r
"vmovl.u8 q15, d22 \n" // a
"vmovl.u8 q11, d22 \n" // a
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
@ -2853,10 +2778,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
"vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
"vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
"vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
"vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
"vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
"vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
"vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
@ -2872,7 +2797,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
: "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -3140,7 +3065,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "q0", "q1" // Clobber List
);
}
#endif // __ARM_NEON__
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -57,20 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
}
#elif defined(HAS_SCALEROWDOWN2_SSE2)
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
ScaleRowDown2Box_Unaligned_SSE2);
if (IS_ALIGNED(src_ptr, 16) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
}
}
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -112,21 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
ScaleRowDown2_16_NEON;
}
#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
#endif
#if defined(HAS_SCALEROWDOWN2_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ?
ScaleRowDown2_Unaligned_16_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
ScaleRowDown2Box_Unaligned_16_SSE2);
if (IS_ALIGNED(src_ptr, 16) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
ScaleRowDown2Box_16_SSE2);
}
}
#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -168,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
#elif defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -212,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
ScaleRowDown4_16_NEON;
}
#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_SCALEROWDOWN4_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
ScaleRowDown4_16_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -271,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
@ -351,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@ -445,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
}
#elif defined(HAS_SCALEROWDOWN38_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_SCALEROWDOWN38_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@ -456,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -522,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
}
}
#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
#endif
#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@ -533,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
#endif
#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@ -758,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
IS_ALIGNED(src_width, 16) &&
&& IS_ALIGNED(src_width, 16)
#endif
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
@ -830,11 +819,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
IS_ALIGNED(src_width, 16) &&
&& IS_ALIGNED(src_width, 16)
#endif
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
@ -886,29 +875,23 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -916,7 +899,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@ -924,7 +907,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@ -988,29 +971,23 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@ -1018,7 +995,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@ -1026,7 +1003,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@ -1087,29 +1064,23 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -1117,7 +1088,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@ -1125,7 +1096,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@ -1144,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
@ -1226,29 +1195,23 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@ -1256,7 +1219,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@ -1264,7 +1227,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@ -1283,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
@ -1366,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
@ -1401,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif

View File

@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 4)) {
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
@ -190,29 +185,23 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -220,15 +209,15 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
@ -286,29 +275,23 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -316,15 +299,15 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@ -346,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -427,18 +408,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@ -446,7 +424,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@ -467,29 +445,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@ -497,15 +469,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@ -531,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@ -640,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif

View File

@ -885,31 +885,23 @@ void ScalePlaneVertical(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@ -917,15 +909,15 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
@ -967,31 +959,23 @@ void ScalePlaneVertical_16(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSE2;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
InterpolateRow = InterpolateRow_16_SSSE3;
}
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@ -999,15 +983,15 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;

View File

@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@ -756,7 +757,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
#endif // __ARM_NEON__
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"

File diff suppressed because it is too large

View File

@ -101,24 +101,20 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
:: "memory", "cc", "xmm0", "xmm1"
);
}
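
What the shift-and-pack sequence above computes, in scalar terms: psrlw $0x8 followed by packuswb keeps the odd byte of each 16-bit pair, i.e. every second source pixel with no filtering. A hedged C sketch (name assumed):

  #include <stddef.h>
  #include <stdint.h>

  // Point-sampled 2x horizontal downscale: copy every second source byte.
  void scale_row_down2_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                              uint8_t* dst_ptr, int dst_width) {
    (void)src_stride;  /* unused in the point-sample variant */
    for (int x = 0; x < dst_width; ++x) {
      dst_ptr[x] = src_ptr[2 * x + 1];
    }
  }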
@ -130,8 +126,8 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
@ -142,18 +138,14 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
@ -163,118 +155,11 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
BUNDLEALIGN
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
@ -296,13 +181,8 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@ -315,8 +195,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@ -330,11 +210,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
@ -348,18 +224,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
@ -388,13 +262,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width), // %2
"+r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
);
}
@ -412,8 +281,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"palignr $0x8,%%xmm0,%%xmm1 \n"
@ -429,11 +298,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
:: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@ -461,8 +326,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
@ -479,9 +344,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm4,%%xmm6 \n"
@ -498,13 +362,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -533,8 +392,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
@ -553,8 +412,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
@ -572,13 +431,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -590,8 +444,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@ -607,10 +461,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width) // %2
: "m"(kShuf38a), // %3
"m"(kShuf38b) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm4", "xmm5"
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
@ -631,9 +482,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pshufb %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,%%xmm6 \n"
@ -643,23 +495,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@ -679,8 +526,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
"movhlps %%xmm0,%%xmm1 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
@ -689,7 +536,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"punpcklbw %%xmm5,%%xmm7 \n"
"paddusw %%xmm6,%%xmm0 \n"
"paddusw %%xmm7,%%xmm1 \n"
MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
"lea " MEMLEA(0x10,0) ",%0 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm6 \n"
@ -711,23 +558,18 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -741,7 +583,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -753,7 +595,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"2: \n"
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@ -765,8 +607,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"3: \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%4 \n"
@ -778,10 +620,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@ -813,7 +652,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm1 \n"
BUNDLEALIGN
MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
@ -853,13 +691,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+rm"(dst_width) // %5
: "rm"(x), // %6
"rm"(dx) // %7
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@ -870,25 +703,21 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(1) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"sub $0x20,%2 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
:: "memory", "cc", "xmm0", "xmm1"
);
}
@ -898,22 +727,18 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
:: "memory", "cc", "xmm0", "xmm1"
);
}
@ -923,25 +748,21 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
:: "memory", "cc", "xmm0", "xmm1"
);
}
@ -951,11 +772,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@ -963,21 +783,16 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
}
@ -996,29 +811,22 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"movd " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
"punpckldq %%xmm1,%%xmm0 \n"
BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"+r"(src_stepx_x12) // %4
:
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
}
@ -1040,11 +848,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"movq " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
BUNDLEALIGN
MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"movq " MEMACCESS(5) ",%%xmm2 \n"
BUNDLEALIGN
MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
@ -1055,9 +861,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
@ -1065,14 +871,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"+rm"(dst_width), // %3
"+r"(src_stepx_x12), // %4
"+r"(row1) // %5
:
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
:: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3"
);
}
@ -1111,15 +911,14 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
"test $0x2,%4 \n"
"je 29f \n"
BUNDLEALIGN
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
"pextrw $0x5,%%xmm2,%k0 \n"
@ -1139,13 +938,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"+r"(dst_width) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@ -1156,28 +950,22 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(1) ",%%xmm0 \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
:: "memory", "cc", NACL_R14
"xmm0", "xmm1"
);
}
@ -1225,7 +1013,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"paddd %%xmm3,%%xmm2 \n"
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"psrlw $0x9,%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
"pshufb %%xmm5,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
@ -1245,7 +1032,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"add $0x1,%2 \n"
"jl 99f \n"
"psrlw $0x9,%%xmm2 \n"
BUNDLEALIGN
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"pshufb %%xmm5,%%xmm2 \n"
"pshufb %%xmm4,%%xmm0 \n"
@ -1264,13 +1050,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"+r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
: "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
, "r14"
#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
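The dominant edit in this file is mechanical: every aligned movdqa load and store becomes an unaligned movdqu, which is what allows the duplicated *_Unaligned_SSE2 kernels above to be deleted outright, and the hand-written clobber lists collapse onto the NACL_R14 macro. An intrinsics sketch, not libyuv code, of what the ScaleRowDown2 loop does after the change (dst_width is assumed to be a multiple of 16, as in the asm loop):

/* Keep every second pixel of a row: 32 source bytes -> 16 destination bytes. */
#include <emmintrin.h>
#include <stdint.h>

void ScaleRowDown2_SSE2_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                               int dst_width) {
  for (int x = 0; x < dst_width; x += 16) {
    __m128i lo = _mm_loadu_si128((const __m128i*)(src_ptr + 0));   /* movdqu */
    __m128i hi = _mm_loadu_si128((const __m128i*)(src_ptr + 16));  /* movdqu */
    lo = _mm_srli_epi16(lo, 8);               /* isolate the odd source pixels */
    hi = _mm_srli_epi16(hi, 8);
    __m128i out = _mm_packus_epi16(lo, hi);   /* pack 16-bit lanes back to bytes */
    _mm_storeu_si128((__m128i*)dst_ptr, out); /* movdqu store */
    src_ptr += 32;
    dst_ptr += 16;
  }
}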

View File

@ -103,17 +103,16 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
@ -133,10 +132,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
@ -149,9 +147,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
ret
@ -172,120 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqa [edx], xmm0
lea edx, [edx + 16]
jg wloop
pop esi
ret
}
}
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
ret
}
}
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
jg wloop
ret
}
}
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@ -305,9 +189,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop esi
@ -329,19 +213,18 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24
pslld xmm5, 16
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
ret
@ -364,18 +247,17 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
movdqa xmm2, [eax + esi * 2]
movdqa xmm3, [eax + esi * 2 + 16]
movdqa xmm4, [eax + edi]
movdqa xmm5, [eax + edi + 16]
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
movdqu xmm4, [eax + edi]
movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@ -398,9 +280,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
sub ecx, 8
jg wloop
pop edi
@ -427,10 +309,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
@ -481,10 +362,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
@ -501,8 +381,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
movdqa xmm0, [eax + 16] // pixels 16..23
movdqa xmm1, [eax + esi + 16]
movdqu xmm0, [eax + 16] // pixels 16..23
movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
@ -511,9 +391,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24]
sub ecx, 24
jg wloop
pop esi
@ -540,10 +420,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
align 4
wloop:
movdqa xmm0, [eax] // pixels 0..7
movdqa xmm1, [eax + esi]
movdqu xmm0, [eax] // pixels 0..7
movdqu xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@ -562,8 +441,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
movdqa xmm0, [eax + 16] // pixels 16..23
movdqa xmm1, [eax + esi + 16]
movdqu xmm0, [eax + 16] // pixels 16..23
movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@ -573,9 +452,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx+24]
sub ecx, 24
jg wloop
pop esi
@ -597,20 +476,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
align 4
xloop:
movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
lea eax, [eax + 32]
pshufb xmm0, xmm4
pshufb xmm1, xmm5
paddusb xmm0, xmm1
sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
sub ecx, 12
jg xloop
ret
@ -633,10 +511,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
align 4
xloop:
movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
movdqa xmm6, [eax + esi]
movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
movdqu xmm6, [eax + esi]
movhlps xmm1, xmm0
movhlps xmm7, xmm6
punpcklbw xmm0, xmm5
@ -645,7 +522,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
punpcklbw xmm7, xmm5
paddusw xmm0, xmm6
paddusw xmm1, xmm7
movdqa xmm6, [eax + esi * 2]
movdqu xmm6, [eax + esi * 2]
lea eax, [eax + 16]
movhlps xmm7, xmm6
punpcklbw xmm6, xmm5
@ -671,11 +548,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
sub ecx, 6
movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
@ -699,11 +576,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
align 4
xloop:
movdqa xmm0, [eax] // average 2 rows into xmm0
pavgb xmm0, [eax + esi]
movdqu xmm0, [eax] // average 2 rows into xmm0
movdqu xmm1, [eax + esi]
lea eax, [eax + 16]
pavgb xmm0, xmm1
movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
pshufb xmm1, xmm2
@ -716,11 +593,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
sub ecx, 6
movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
sub ecx, 6
jg xloop
pop esi
@ -747,10 +624,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4
dec ebx
align 4
xloop:
// first row
movdqa xmm0, [esi]
movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
@ -761,9 +637,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone
// sum remaining rows
align 4
yloop:
movdqa xmm2, [eax] // read 16 pixels
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
@ -773,10 +648,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
align 4
ydone:
movdqa [edi], xmm0
movdqa [edi + 16], xmm1
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16
@ -828,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@ -851,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 2 // 2 pixels
jge xloop2
align 4
xloop29:
add ecx, 2 - 1
@ -869,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0
mov [edi], bl
align 4
xloop99:
pop edi
@ -889,17 +760,16 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
sub ecx, 32
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg wloop
ret
@ -918,15 +788,14 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
@ -945,18 +814,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
ret
@ -976,12 +844,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
@ -989,9 +856,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop esi
@ -1016,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 4
wloop:
movd xmm0, [eax]
movd xmm1, [eax + ebx]
@ -1026,9 +892,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
@ -1057,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
align 4
wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx]
@ -1075,9 +940,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
sub ecx, 4
movdqa [edx], xmm0
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
jg wloop
pop edi
@ -1118,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
jl xloop49
// 4 Pixel loop.
align 4
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@ -1133,12 +997,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
sub ecx, 4 // 4 pixels
jge xloop4
align 4
xloop49:
test ecx, 2
je xloop29
@ -1159,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
align 4
xloop99:
pop esi
@ -1209,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@ -1229,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
sub ecx, 2 // 2 pixels
jge xloop2
align 4
xloop29:
add ecx, 2 - 1
@ -1246,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
align 4
xloop99:
pop edi
@ -1265,17 +1124,16 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width
align 4
wloop:
movdqa xmm0, [eax]
movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
sub ecx, 8
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg wloop
ret
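The MSVC file gets the same treatment: movdqa becomes movdqu, the *_Unaligned_* copies disappear, the align 4 directives are dropped, and the sub ecx, N counter update moves to sit directly before jg so the branch consumes freshly set flags. For reference, an intrinsics sketch (not libyuv code) of the ScaleRowDown2Box kernel shown above, i.e. a rounding 2x2 box average; dst_width is assumed to be a multiple of 16:

/* Average a 32x2 source block down to 16 destination pixels. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

void ScaleRowDown2Box_SSE2_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                  uint8_t* dst_ptr, int dst_width) {
  const __m128i mask = _mm_set1_epi16(0x00ff);        /* pcmpeqb/psrlw mask */
  for (int x = 0; x < dst_width; x += 16) {
    __m128i r0a = _mm_loadu_si128((const __m128i*)(src_ptr + 0));
    __m128i r0b = _mm_loadu_si128((const __m128i*)(src_ptr + 16));
    __m128i r1a = _mm_loadu_si128((const __m128i*)(src_ptr + src_stride));
    __m128i r1b = _mm_loadu_si128((const __m128i*)(src_ptr + src_stride + 16));
    r0a = _mm_avg_epu8(r0a, r1a);                     /* average the two rows */
    r0b = _mm_avg_epu8(r0b, r1b);
    __m128i even_a = _mm_and_si128(r0a, mask);        /* even columns as words */
    __m128i odd_a  = _mm_srli_epi16(r0a, 8);          /* odd columns as words */
    __m128i even_b = _mm_and_si128(r0b, mask);
    __m128i odd_b  = _mm_srli_epi16(r0b, 8);
    __m128i a = _mm_avg_epu16(odd_a, even_a);         /* average the columns */
    __m128i b = _mm_avg_epu16(odd_b, even_b);
    _mm_storeu_si128((__m128i*)dst_ptr, _mm_packus_epi16(a, b));
    src_ptr += 32;
    dst_ptr += 16;
  }
}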

View File

@ -33,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
{FOURCC_BA81, FOURCC_BGGR},
{FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
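The final hunk only annotates the BA81 -> BGGR entry as deprecated, but the table itself is the interesting part: it folds equivalent four-character codes onto one canonical value before format dispatch, and CanonicalFourCC() is the libyuv entry point that walks it. A self-contained sketch of that lookup with stand-in names (SKETCH_FOURCC and the struct below are simplified placeholders, not the real headers):

#include <stddef.h>
#include <stdint.h>

#define SKETCH_FOURCC(a, b, c, d)                                     \
  ((uint32_t)(a) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) |     \
   ((uint32_t)(d) << 24))

struct FourCCAliasSketch { uint32_t alias; uint32_t canonical; };

static const struct FourCCAliasSketch kAliasesSketch[] = {
  { SKETCH_FOURCC('J','P','E','G'), SKETCH_FOURCC('M','J','P','G') },
  { SKETCH_FOURCC('B','A','8','1'), SKETCH_FOURCC('B','G','G','R') },  /* deprecated alias */
};

uint32_t CanonicalFourCC_sketch(uint32_t fourcc) {
  for (size_t i = 0; i < sizeof(kAliasesSketch) / sizeof(kAliasesSketch[0]); ++i) {
    if (kAliasesSketch[i].alias == fourcc) {
      return kAliasesSketch[i].canonical;   /* map alias to canonical code */
    }
  }
  return fourcc;                            /* not an alias: already canonical */
}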