libyuv: update to r1060

picks up some lint, build fixes

Change-Id: I0efb19385afa4ea3073a53e2b8334e57f245eea0
This commit is contained in:
James Zern 2014-08-22 10:31:01 -07:00
parent 812506b80c
commit b644eb9f44
15 changed files with 684 additions and 474 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1041 Version: 1060
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -153,7 +153,6 @@ class LIBYUV_API MJpegDecoder {
int* subsample_x, int* subsample_y, int number_of_components); int* subsample_x, int* subsample_y, int number_of_components);
private: private:
void AllocOutputBuffers(int num_outbufs); void AllocOutputBuffers(int num_outbufs);
void DestroyOutputBuffers(); void DestroyOutputBuffers();

View File

@ -252,6 +252,94 @@ extern "C" {
// The following are available on arm64 platforms: // The following are available on arm64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// #define HAS_I444TOARGBROW_NEON
// #define HAS_I422TOARGBROW_NEON
// #define HAS_I411TOARGBROW_NEON
// #define HAS_I422TOBGRAROW_NEON
// #define HAS_I422TOABGRROW_NEON
// #define HAS_I422TORGBAROW_NEON
// #define HAS_I422TORGB24ROW_NEON
// #define HAS_I422TORAWROW_NEON
// #define HAS_I422TORGB565ROW_NEON
// #define HAS_I422TOARGB1555ROW_NEON
// #define HAS_I422TOARGB4444ROW_NEON
// #define HAS_YTOARGBROW_NEON
// #define HAS_I400TOARGBROW_NEON
// #define HAS_NV12TOARGBROW_NEON
// #define HAS_NV21TOARGBROW_NEON
// #define HAS_NV12TORGB565ROW_NEON
// #define HAS_NV21TORGB565ROW_NEON
// #define HAS_YUY2TOARGBROW_NEON
// #define HAS_UYVYTOARGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_COPYROW_NEON
#define HAS_SETROW_NEON
#define HAS_ARGBSETROWS_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_ARGBMIRRORROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
// #define HAS_RGB565TOARGBROW_NEON
// #define HAS_ARGB1555TOARGBROW_NEON
// #define HAS_ARGB4444TOARGBROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_HALFROW_NEON
#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTOBAYERGGROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I422TOUYVYROW_NEON
// #define HAS_ARGBTORGB565ROW_NEON
// #define HAS_ARGBTOARGB1555ROW_NEON
// #define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_ARGBTOYJROW_NEON
// #define HAS_ARGBTOUV444ROW_NEON
// #define HAS_ARGBTOUV422ROW_NEON
// #define HAS_ARGBTOUV411ROW_NEON
// #define HAS_ARGBTOUVROW_NEON
// #define HAS_ARGBTOUVJROW_NEON
// #define HAS_BGRATOUVROW_NEON
// #define HAS_ABGRTOUVROW_NEON
// #define HAS_RGBATOUVROW_NEON
// #define HAS_RGB24TOUVROW_NEON
// #define HAS_RAWTOUVROW_NEON
// #define HAS_RGB565TOUVROW_NEON
// #define HAS_ARGB1555TOUVROW_NEON
// #define HAS_ARGB4444TOUVROW_NEON
// #define HAS_RGB565TOYROW_NEON
// #define HAS_ARGB1555TOYROW_NEON
// #define HAS_ARGB4444TOYROW_NEON
// #define HAS_BGRATOYROW_NEON
// #define HAS_ABGRTOYROW_NEON
// #define HAS_RGBATOYROW_NEON
// #define HAS_RGB24TOYROW_NEON
// #define HAS_RAWTOYROW_NEON
// #define HAS_INTERPOLATEROW_NEON
// #define HAS_ARGBBLENDROW_NEON
// #define HAS_ARGBATTENUATEROW_NEON
// #define HAS_ARGBQUANTIZEROW_NEON
// #define HAS_ARGBSHADEROW_NEON
// #define HAS_ARGBGRAYROW_NEON
// #define HAS_ARGBSEPIAROW_NEON
// #define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
#define HAS_SOBELYROW_NEON
#endif #endif
// The following are available on Neon platforms: // The following are available on Neon platforms:
@ -465,7 +553,7 @@ typedef uint8 uvec8[16];
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
#endif // defined(__native_client__) && defined(__x86_64__) #endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) #if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS #undef MEMACCESS
#if defined(__native_client__) #if defined(__native_client__)
#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" #define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"

View File

@ -51,6 +51,14 @@ extern "C" {
#define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON #define HAS_SCALEARGBROWDOWN2_NEON
#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__aarch64__) || defined(LIBYUV_NEON))
/* #define HAS_SCALEROWDOWN2_NEON */
/* #define HAS_SCALEROWDOWN4_NEON */
/* #define HAS_SCALEROWDOWN34_NEON */
/* #define HAS_SCALEROWDOWN38_NEON */
/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif #endif
// The following are available on Mips platforms: // The following are available on Mips platforms:

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1041 #define LIBYUV_VERSION 1059
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON #define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif #endif

View File

@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse; return sse;
} }
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"bgt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // __ARM_NEON__ #endif // __ARM_NEON__
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -401,7 +401,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v, uint8* dst_v, int dst_stride_v,
int width, int height) { int width, int height) {
int y; int y;
int halfheight = (height + 1) >> 1; int halfheight;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) = YUY2ToUV422Row_C; int pix) = YUY2ToUV422Row_C;
@ -711,13 +711,15 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
@ -963,9 +965,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C; ARGBToYRow_C;
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif #endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v || if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -1022,6 +1021,13 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON #endif // HAS_RGB24TOYROW_NEON
{
#if !defined(HAS_RGB24TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RGB24TOYROW_NEON) #if defined(HAS_RGB24TOYROW_NEON)
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
@ -1052,6 +1058,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
#if !defined(HAS_RGB24TOYROW_NEON) #if !defined(HAS_RGB24TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif #endif
}
return 0; return 0;
} }
@ -1075,9 +1082,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C; ARGBToYRow_C;
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif #endif
if (!src_raw || !dst_y || !dst_u || !dst_v || if (!src_raw || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -1134,6 +1138,11 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON #endif // HAS_RAWTOYROW_NEON
{
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
@ -1164,6 +1173,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
#if !defined(HAS_RAWTOYROW_NEON) #if !defined(HAS_RAWTOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif #endif
}
return 0; return 0;
} }
@ -1187,9 +1197,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C; ARGBToYRow_C;
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif #endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v || if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -1246,6 +1253,13 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB565TOYROW_NEON #endif // HAS_RGB565TOYROW_NEON
{
#if !defined(HAS_RGB565TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RGB565TOYROW_NEON) #if defined(HAS_RGB565TOYROW_NEON)
RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
@ -1276,6 +1290,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
#if !defined(HAS_RGB565TOYROW_NEON) #if !defined(HAS_RGB565TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif #endif
}
return 0; return 0;
} }
@ -1299,9 +1314,6 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C; ARGBToYRow_C;
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif #endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v || if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -1358,6 +1370,12 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB1555TOYROW_NEON #endif // HAS_ARGB1555TOYROW_NEON
{
#if !defined(HAS_ARGB1555TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
@ -1390,6 +1408,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#if !defined(HAS_ARGB1555TOYROW_NEON) #if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif #endif
}
return 0; return 0;
} }
@ -1413,9 +1432,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C; ARGBToYRow_C;
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif #endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v || if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -1472,6 +1488,13 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB4444TOYROW_NEON #endif // HAS_ARGB4444TOYROW_NEON
{
#if !defined(HAS_ARGB4444TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB4444TOYROW_NEON) #if defined(HAS_ARGB4444TOYROW_NEON)
ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
@ -1504,6 +1527,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
#if !defined(HAS_ARGB4444TOYROW_NEON) #if !defined(HAS_ARGB4444TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif #endif
}
return 0; return 0;
} }

View File

@ -60,6 +60,13 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV444ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@ -76,10 +83,8 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
#elif defined(HAS_ARGBTOYROW_NEON) #elif defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) { if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
ARGBToUV444Row = ARGBToUV444Row_NEON;
} }
} }
#endif #endif
@ -134,6 +139,13 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
@ -153,12 +165,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
@ -228,13 +234,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 32) { }
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON; ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON; ARGBToUV411Row = ARGBToUV411Row_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
@ -261,9 +269,6 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
ARGBToYRow_C; ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C; int width) = MergeUVRow_C;
// Allocate a rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb || if (!src_argb ||
!dst_y || !dst_uv || !dst_y || !dst_uv ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -296,13 +301,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
#if defined(HAS_MERGEUVROW_SSE2) #if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
@ -331,6 +338,10 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
{
// Allocate a rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@ -347,6 +358,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb, dst_y, width);
} }
free_aligned_buffer_64(row_u); free_aligned_buffer_64(row_u);
}
return 0; return 0;
} }
@ -364,9 +376,6 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
ARGBToYRow_C; ARGBToYRow_C;
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C; int width) = MergeUVRow_C;
// Allocate a rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
if (!src_argb || if (!src_argb ||
!dst_y || !dst_uv || !dst_y || !dst_uv ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
@ -399,13 +408,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
#if defined(HAS_MERGEUVROW_SSE2) #if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
@ -434,6 +445,10 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
{
// Allocate a rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@ -450,6 +465,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb, dst_y, width);
} }
free_aligned_buffer_64(row_u); free_aligned_buffer_64(row_u);
}
return 0; return 0;
} }
@ -493,6 +509,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@ -510,12 +533,6 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
@ -594,6 +611,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
} }
#elif defined(HAS_ARGBTOUV422ROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
@ -611,12 +635,6 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
}
}
} }
#endif #endif
@ -1022,13 +1040,15 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON; ARGBToYJRow = ARGBToYJRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON; ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON; ARGBToUVJRow = ARGBToUVJRow_NEON;
} }
} }
}
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {

View File

@ -14,7 +14,7 @@
#include <intrin.h> // For __cpuidex() #include <intrin.h> // For __cpuidex()
#endif #endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && defined(_M_X64) && \ !defined(__native_client__) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv() #include <immintrin.h> // For _xgetbv()
#endif #endif
@ -97,7 +97,7 @@ int TestOsSaveYmm() {
uint32 xcr0 = 0u; uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) #elif defined(_M_IX86) && defined(_MSC_VER)
__asm { __asm {
xor ecx, ecx // xcr 0 xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
@ -256,12 +256,17 @@ int InitCpuFlags(void) {
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2; cpu_info_ &= ~kCpuHasMIPS_DSPR2;
} }
#elif defined(__arm__) #elif defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__ // gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
// For Linux, /proc/cpuinfo can be tested but without that assume Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon.
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) #if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
cpu_info_ = kCpuHasNEON; cpu_info_ = kCpuHasNEON;
// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon
// flag in it.
// So for aarch64, neon enabling is hard coded here.
#elif defined(__aarch64__)
cpu_info_ = kCpuHasNEON;
#else #else
// Linux arm parse text file for neon detect. // Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); cpu_info_ = ArmCpuCaps("/proc/cpuinfo");

View File

@ -332,13 +332,15 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON; ARGBToYRow = ARGBToYRow_NEON;
} }
if (width >= 16) { }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON; ARGBToUVRow = ARGBToUVRow_NEON;
} }
} }
}
#endif #endif
switch (src_fourcc_bayer) { switch (src_fourcc_bayer) {

View File

@ -13,8 +13,8 @@
#ifdef HAVE_JPEG #ifdef HAVE_JPEG
#include <assert.h> #include <assert.h>
#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(TARGET_IPHONE_SIMULATOR) !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
// Must be included before jpeglib. // Must be included before jpeglib.
#include <setjmp.h> #include <setjmp.h>
#define HAVE_SETJMP #define HAVE_SETJMP
@ -101,7 +101,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
} }
buf_.data = src; buf_.data = src;
buf_.len = (int)(src_len); buf_.len = static_cast<int>(src_len);
buf_vec_.pos = 0; buf_vec_.pos = 0;
decompress_struct_->client_data = &buf_vec_; decompress_struct_->client_data = &buf_vec_;
#ifdef HAVE_SETJMP #ifdef HAVE_SETJMP
@ -411,7 +411,7 @@ void init_source(j_decompress_ptr cinfo) {
} }
boolean fill_input_buffer(j_decompress_ptr cinfo) { boolean fill_input_buffer(j_decompress_ptr cinfo) {
BufferVector* buf_vec = (BufferVector*)(cinfo->client_data); BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
if (buf_vec->pos >= buf_vec->len) { if (buf_vec->pos >= buf_vec->len) {
assert(0 && "No more data"); assert(0 && "No more data");
// ERROR: No more data // ERROR: No more data
@ -447,7 +447,7 @@ void ErrorHandler(j_common_ptr cinfo) {
// ERROR: Error in jpeglib: buf // ERROR: Error in jpeglib: buf
#endif #endif
SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err); SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp() // This rewinds the call stack to the point of the corresponding setjmp()
// and causes it to return (for a second time) with value 1. // and causes it to return (for a second time) with value 1.
longjmp(mgr->setjmp_buffer, 1); longjmp(mgr->setjmp_buffer, 1);

View File

@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7) 1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOARGBROW_NEON #endif // HAS_I422TOARGBROW_NEON
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
#endif // HAS_I422TOYUY2ROW_NEON
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I422TOUYVYROW_NEON
#undef YANY #undef YANY
// Wrappers to handle odd width // Wrappers to handle odd width
@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_YUY2TOYROW_NEON
YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_UYVYTOYROW_NEON
YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
#endif #endif
#undef YANY #undef YANY
@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif #endif
#undef UVANY #undef UVANY

View File

@ -824,19 +824,19 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V "st1 {v1.16b}, [%2], #16 \n" // store V
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 // Output registers "+r"(width) // %3 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_SPLITUVROW_NEON #endif // HAS_SPLITUVROW_NEON
@ -849,12 +849,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"bgt 1b \n" "bgt 1b \n"
: :
"+r"(src_u), // %0 "+r"(src_u), // %0
@ -862,7 +862,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"+r"(dst_uv), // %2 "+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers "+r"(width) // %3 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_MERGEUVROW_NEON #endif // HAS_MERGEUVROW_NEON
@ -874,16 +874,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(count) // %2 // Output registers "+r"(count) // %2 // Output registers
: // Input registers : // Input registers
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_COPYROW_NEON #endif // HAS_COPYROW_NEON
@ -892,16 +892,16 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#ifdef HAS_SETROW_NEON #ifdef HAS_SETROW_NEON
void SetRow_NEON(uint8* dst, uint32 v32, int count) { void SetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
"vdup.u32 q0, %2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"bgt 1b \n" "bgt 1b \n"
: "+r"(dst), // %0 : "+r"(dst), // %0
"+r"(count) // %1 "+r"(count) // %1
: "r"(v32) // %2 : "r"(v32) // %2
: "cc", "memory", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_SETROW_NEON #endif // HAS_SETROW_NEON
@ -922,26 +922,25 @@ void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r3, #-16 \n"
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop. "subs %2, %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n" "rev64 v0.16b, v0.16b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "r3", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_MIRRORROW_NEON #endif // HAS_MIRRORROW_NEON
@ -951,27 +950,27 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop. "subs %3, %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n" "rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" "st1 {v1.8b}, [%2], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 "+r"(width) // %3
: : "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "r12", "q0" : "cc", "memory", "v0", "v1"
); );
} }
#endif // HAS_MIRRORUVROW_NEON #endif // HAS_MIRRORUVROW_NEON
@ -980,26 +979,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"mov r3, #-16 \n"
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n" "sub %0, %0, #16 \n"
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop. "subs %2, %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n" "rev64 v0.4s, v0.4s \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" "st1 {v0.D}[0], [%1], #8 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "r3", "q0" : "cc", "memory", "v0"
); );
} }
#endif // HAS_ARGBMIRRORROW_NEON #endif // HAS_ARGBMIRRORROW_NEON
@ -1007,20 +1005,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_RGB24TOARGBROW_NEON #ifdef HAS_RGB24TOARGBROW_NEON
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "movi v4.8b, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
); );
} }
#endif // HAS_RGB24TOARGBROW_NEON #endif // HAS_RGB24TOARGBROW_NEON
@ -1028,21 +1026,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
#ifdef HAS_RAWTOARGBROW_NEON #ifdef HAS_RAWTOARGBROW_NEON
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d4, #255 \n" // Alpha "movi v5.8b, #255 \n" // Alpha
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "mov v3.8b, v1.8b \n" // move g
"mov v4.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_RAWTOARGBROW_NEON #endif // HAS_RAWTOARGBROW_NEON
@ -1170,16 +1169,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1 "+r"(dst_rgb24), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
); );
} }
#endif // HAS_ARGBTORGB24ROW_NEON #endif // HAS_ARGBTORGB24ROW_NEON
@ -1190,17 +1189,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B "mov v4.8b, v2.8b \n" // mov g
"mov v5.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_raw), // %1 "+r"(dst_raw), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_ARGBTORAWROW_NEON #endif // HAS_ARGBTORAWROW_NEON
@ -1211,16 +1211,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_YUY2TOYROW_NEON #endif // HAS_YUY2TOYROW_NEON
@ -1231,16 +1231,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_UYVYTOYROW_NEON #endif // HAS_UYVYTOYROW_NEON
@ -1252,19 +1252,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V. "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_YUY2TOUV422ROW_NEON #endif // HAS_YUY2TOUV422ROW_NEON
@ -1276,19 +1276,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V. "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_UYVYTOUV422ROW_NEON #endif // HAS_UYVYTOUV422ROW_NEON
@ -1297,20 +1297,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U. "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V. "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(stride_yuy2), // %1 "+r"(stride_yuy2), // %1
@ -1318,7 +1318,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_YUY2TOUVROW_NEON #endif // HAS_YUY2TOUVROW_NEON
@ -1327,20 +1327,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) { uint8* dst_u, uint8* dst_v, int pix) {
asm volatile ( asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U. "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V. "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(stride_uyvy), // %1 "+r"(stride_uyvy), // %1
@ -1348,7 +1348,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_v), // %3 "+r"(dst_v), // %3
"+r"(pix) // %4 "+r"(pix) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
); );
} }
#endif // HAS_UYVYTOUVROW_NEON #endif // HAS_UYVYTOUVROW_NEON
@ -1358,23 +1358,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) { uint8* dst_uv, int pix) {
asm volatile ( asm volatile (
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %x1, %x0, %w1, sxtw \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop "subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" "st1 {v0.16b}, [%2], #16 \n"
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(src_uv_stride), // %1 "+r"(src_uv_stride), // %1
"+r"(dst_uv), // %2 "+r"(dst_uv), // %2
"+r"(pix) // %3 "+r"(pix) // %3
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1" // Clobber List
); );
} }
#endif // HAS_HALFROW_NEON #endif // HAS_HALFROW_NEON
@ -1384,22 +1384,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) { uint32 selector, int pix) {
asm volatile ( asm volatile (
"vmov.u32 d6[0], %3 \n" // selector "mov v2.s[0], %w3 \n" // selector
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8. "st1 {v4.8b}, [%1], #8 \n" // store 8.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(selector) // %3 : "r"(selector) // %3
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
); );
} }
#endif // HAS_ARGBTOBAYERROW_NEON #endif // HAS_ARGBTOBAYERROW_NEON
@ -1411,16 +1411,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile ( asm volatile (
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's. "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_ARGBTOBAYERGGROW_NEON #endif // HAS_ARGBTOBAYERGGROW_NEON
@ -1431,21 +1431,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
asm volatile ( asm volatile (
MEMACCESS(3) MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler "ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: "r"(shuffler) // %3 : "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List : "cc", "memory", "v0", "v1", "v2" // Clobber List
); );
} }
#endif // HAS_ARGBSHUFFLEROW_NEON #endif // HAS_ARGBSHUFFLEROW_NEON
@ -1459,14 +1458,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"mov v2.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
@ -1474,7 +1474,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"+r"(dst_yuy2), // %3 "+r"(dst_yuy2), // %3
"+r"(width) // %4 "+r"(width) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_I422TOYUY2ROW_NEON #endif // HAS_I422TOYUY2ROW_NEON
@ -1488,14 +1488,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
"mov v3.8b, v2.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(src_u), // %1 "+r"(src_u), // %1
@ -1503,7 +1504,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"+r"(dst_uyvy), // %3 "+r"(dst_uyvy), // %3
"+r"(width) // %4 "+r"(width) // %4
: :
: "cc", "memory", "d0", "d1", "d2", "d3" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_I422TOUYVYROW_NEON #endif // HAS_I422TOUYVYROW_NEON
@ -1577,28 +1578,28 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
#ifdef HAS_ARGBTOYROW_NEON #ifdef HAS_ARGBTOYROW_NEON
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1", "q2", "q12", "q13" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBTOYROW_NEON #endif // HAS_ARGBTOYROW_NEON
@ -1606,26 +1607,26 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
#ifdef HAS_ARGBTOYJROW_NEON #ifdef HAS_ARGBTOYJROW_NEON
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile ( asm volatile (
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "movi v4.8b, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1) MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(pix) // %2 "+r"(pix) // %2
: :
: "cc", "memory", "q0", "q1", "q2", "q12", "q13" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
); );
} }
#endif // HAS_ARGBTOYJROW_NEON #endif // HAS_ARGBTOYJROW_NEON
@ -3048,20 +3049,20 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"vmull.u8 q1, d2, d3 \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
"vmull.u8 q2, d4, d5 \n" // multiply R "umull v2.8h, v2.8b, v6.8b \n" // multiply R
"vmull.u8 q3, d6, d7 \n" // multiply A "umull v3.8h, v3.8b, v7.8b \n" // multiply A
"vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
"vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
"vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
@ -3069,7 +3070,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1", "q2", "q3" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBMULTIPLYROW_NEON #endif // HAS_ARGBMULTIPLYROW_NEON
@ -3083,14 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G "uqadd v0.8b, v0.8b, v4.8b \n"
"vqadd.u8 q1, q1, q3 \n" // add R, A "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
@ -3098,7 +3101,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1", "q2", "q3" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBADDROW_NEON #endif // HAS_ARGBADDROW_NEON
@ -3112,14 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G "uqsub v0.8b, v0.8b, v4.8b \n"
"vqsub.u8 q1, q1, q3 \n" // subtract R, A "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
@ -3127,7 +3132,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1", "q2", "q3" : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
); );
} }
#endif // HAS_ARGBSUBTRACTROW_NEON #endif // HAS_ARGBSUBTRACTROW_NEON
@ -3141,27 +3146,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"vmov.u8 d1, d0 \n" "mov v1.8b, v0.8b \n"
"vmov.u8 d2, d0 \n" "mov v2.8b, v0.8b \n"
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_SOBELROW_NEON #endif // HAS_SOBELROW_NEON
@ -3175,20 +3180,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_y), // %2 "+r"(dst_y), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1"
); );
} }
#endif // HAS_SOBELTOPLANEROW_NEON #endif // HAS_SOBELTOPLANEROW_NEON
@ -3202,25 +3207,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"vmov.u8 d3, #255 \n" // alpha "movi v3.8b, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(width) // %3 "+r"(width) // %3
: :
: "cc", "memory", "q0", "q1" : "cc", "memory", "v0", "v1", "v2", "v3"
); );
} }
#endif // HAS_SOBELXYROW_NEON #endif // HAS_SOBELXYROW_NEON
@ -3236,28 +3241,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top "ld1 {v0.8b}, [%0],%5 \n" // top
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n" "ld1 {v1.8b}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n" "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n" "ld1 {v3.8b}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2) MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n" "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vabs.s16 q0, q0 \n" "abs v0.8h, v0.8h \n"
"vqmovn.u16 d0, q0 \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(3) MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
@ -3266,7 +3271,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %4 "+r"(width) // %4
: "r"(2), // %5 : "r"(2), // %5
"r"(6) // %6 "r"(6) // %6
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_SOBELXROW_NEON #endif // HAS_SOBELXROW_NEON
@ -3282,28 +3287,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
".p2align 2 \n" ".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left "ld1 {v0.8b}, [%0],%4 \n" // left
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n" "ld1 {v1.8b}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n" "usubl v0.8h, v0.8b, v1.8b \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n" "ld1 {v3.8b}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1) MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n" "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n" "usubl v1.8h, v2.8b, v3.8b \n"
"vadd.s16 q0, q0, q1 \n" "add v0.8h, v0.8h, v1.8h \n"
"vabs.s16 q0, q0 \n" "abs v0.8h, v0.8h \n"
"vqmovn.u16 d0, q0 \n" "uqxtn v0.8b, v0.8h \n"
MEMACCESS(2) MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
@ -3311,7 +3316,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %3 "+r"(width) // %3
: "r"(1), // %4 : "r"(1), // %4
"r"(6) // %5 "r"(6) // %5
: "cc", "memory", "q0", "q1" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
#endif // HAS_SOBELYROW_NEON #endif // HAS_SOBELYROW_NEON

View File

@ -10,7 +10,7 @@
#include "libyuv/row.h" #include "libyuv/row.h"
#if defined (_M_X64) #if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#include <emmintrin.h> #include <emmintrin.h>
#include <tmmintrin.h> // For _mm_maddubs_epi16 #include <tmmintrin.h> // For _mm_maddubs_epi16
#endif #endif
@ -78,7 +78,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
__m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128(); const __m128i xmm4 = _mm_setzero_si128();
@ -132,7 +131,6 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* v_buf, const uint8* v_buf,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
__m128i xmm0, xmm1, xmm2, xmm3; __m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1); const __m128i xmm5 = _mm_set1_epi8(-1);
const __m128i xmm4 = _mm_setzero_si128(); const __m128i xmm4 = _mm_setzero_si128();