diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index 3869d25bc..acd52ed0e 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1060 +Version: 1305 License: BSD License File: LICENSE @@ -13,4 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: -cherry-pick 'Issue 24479004: Fix building with MSVC for arm' +None. diff --git a/third_party/libyuv/include/libyuv/compare.h b/third_party/libyuv/include/libyuv/compare.h index 5dfac7c86..08b2bb2ec 100644 --- a/third_party/libyuv/include/libyuv/compare.h +++ b/third_party/libyuv/include/libyuv/compare.h @@ -22,6 +22,11 @@ extern "C" { LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); + // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API uint64 ComputeSumSquareError(const uint8* src_a, diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h index 1bd45c837..97936adb2 100644 --- a/third_party/libyuv/include/libyuv/convert.h +++ b/third_party/libyuv/include/libyuv/convert.h @@ -113,15 +113,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert Q420 to I420. -LIBYUV_API -int Q420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - // ARGB little endian (bgra in memory) to I420. LIBYUV_API int ARGBToI420(const uint8* src_frame, int src_stride_frame, @@ -211,8 +202,6 @@ int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height); #endif -// Note Bayer formats (BGGR) To I420 are in format_conversion.h - // Convert camera sample to I420 with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. // "dst_stride_y" number of bytes in a row of the dst_y plane. diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h index a18014ca2..a0de89efd 100644 --- a/third_party/libyuv/include/libyuv/convert_argb.h +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -18,7 +18,6 @@ #include "libyuv/rotate.h" // TODO(fbarchard): This set of functions should exactly match convert.h -// Add missing Q420. // TODO(fbarchard): Add tests. Create random content of right size and convert // with C vs Opt and or to I420 and compare. // TODO(fbarchard): Some of these functions lack parameter setting. @@ -104,13 +103,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, uint8* dst_argb, int dst_stride_argb, int width, int height); -// TODO(fbarchard): Convert Q420 to ARGB. -// LIBYUV_API -// int Q420ToARGB(const uint8* src_y, int src_stride_y, -// const uint8* src_yuy2, int src_stride_yuy2, -// uint8* dst_argb, int dst_stride_argb, -// int width, int height); - // Convert YUY2 to ARGB. 
LIBYUV_API int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, @@ -123,6 +115,22 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + // BGRA little endian (argb in memory) to ARGB. LIBYUV_API int BGRAToARGB(const uint8* src_frame, int src_stride_frame, @@ -184,8 +192,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, int dst_width, int dst_height); #endif -// Note Bayer formats (BGGR) to ARGB are in format_conversion.h. - // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h index b1cf57f7d..d6c0e3d87 100644 --- a/third_party/libyuv/include/libyuv/convert_from.h +++ b/third_party/libyuv/include/libyuv/convert_from.h @@ -57,7 +57,6 @@ int I400Copy(const uint8* src_y, int src_stride_y, int width, int height); // TODO(fbarchard): I420ToM420 -// TODO(fbarchard): I420ToQ420 LIBYUV_API int I420ToNV12(const uint8* src_y, int src_stride_y, @@ -152,8 +151,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); -// Note Bayer formats (BGGR) To I420 are in format_conversion.h. - // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h index 90f43af04..c592fc235 100644 --- a/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height); +// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). +// Values in dither matrix from 0 to 255. 128 is best for no dither. +LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither8x8, int width, int height); + // Convert ARGB To ARGB1555. LIBYUV_API int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, @@ -105,6 +112,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, uint8* dst_v, int dst_stride_v, int width, int height); +// Convert ARGB to J422. +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + // Convert ARGB To I411. 
LIBYUV_API int ARGBToI411(const uint8* src_argb, int src_stride_argb, diff --git a/third_party/libyuv/include/libyuv/format_conversion.h b/third_party/libyuv/include/libyuv/format_conversion.h deleted file mode 100644 index b18bf0534..000000000 --- a/third_party/libyuv/include/libyuv/format_conversion.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT -#define INCLUDE_LIBYUV_FORMATCONVERSION_H_ - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// Convert Bayer RGB formats to I420. -LIBYUV_API -int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Temporary API mapper. -#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \ - BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f) - -LIBYUV_API -int BayerToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, - uint32 src_fourcc_bayer); - -// Convert I420 to Bayer RGB formats. -LIBYUV_API -int I420ToBayerBGGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -LIBYUV_API -int I420ToBayerGBRG(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -LIBYUV_API -int I420ToBayerGRBG(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -LIBYUV_API -int I420ToBayerRGGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); - -// Temporary API mapper. -#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \ - I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f) - -LIBYUV_API -int I420ToBayer(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height, - uint32 dst_fourcc_bayer); - -// Convert Bayer RGB formats to ARGB. 
-LIBYUV_API -int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -LIBYUV_API -int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -LIBYUV_API -int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -LIBYUV_API -int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - -// Temporary API mapper. -#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f) - -LIBYUV_API -int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height, - uint32 src_fourcc_bayer); - -// Converts ARGB to Bayer RGB formats. -LIBYUV_API -int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height); - -LIBYUV_API -int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height); - -LIBYUV_API -int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height); - -LIBYUV_API -int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height); - -// Temporary API mapper. -#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f) - -LIBYUV_API -int ARGBToBayer(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height, - uint32 dst_fourcc_bayer); - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - -#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index 4b3c870f9..80e844bae 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -15,10 +15,6 @@ #include "libyuv/basic_types.h" -#if defined(__native_client__) -#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE -#endif - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -43,6 +39,7 @@ extern "C" { #if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ defined(TARGET_IPHONE_SIMULATOR) || \ + (defined(__i386__) && !defined(__SSE2__)) || \ (defined(_MSC_VER) && defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif @@ -51,48 +48,16 @@ extern "C" { #define LIBYUV_SSSE3_ONLY #endif -// Enable for NaCL pepper 33 for bundle and AVX2 support. -#if defined(__native_client__) && PPAPI_RELEASE >= 33 -#define NEW_BINUTILS -#endif -#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37 +// clang >= 3.5.0 required for Arm64. 
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) +#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) #define LIBYUV_DISABLE_NEON -#endif +#endif // clang >= 3.5 +#endif // __clang__ // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Effects: -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBCOPYALPHAROW_SSE2 -#define HAS_ARGBCOPYYTOALPHAROW_SSE2 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSSE3 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBPOLYNOMIALROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBTOUVROW_SSSE3 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_RGBCOLORTABLEROW_X86 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELTOPLANEROW_SSE2 -#define HAS_SOBELXROW_SSE2 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSE2 - // Conversions: #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 @@ -103,24 +68,21 @@ extern "C" { #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTOBAYERGGROW_SSE2 -#define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 -#define HAS_COPYROW_X86 -#define HAS_HALFROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 -#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -133,6 +95,7 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +// #define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 @@ -150,6 +113,8 @@ extern "C" { #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_X86 +#define HAS_SETROW_ERMS +#define HAS_ARGBSETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 @@ -160,6 +125,36 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 + +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define 
HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 #endif // The following are available on x64 Visual C: @@ -186,26 +181,39 @@ extern "C" { #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 +// The following are available require VS2012. Port to GCC. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393 +#define HAS_I422TOABGRROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TOBGRAROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 +#define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 +#endif + // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. #if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) -// Effects: -#define HAS_ARGBPOLYNOMIALROW_AVX2 -#define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 -#endif - -// The following are require VS2012. -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#define HAS_HALFROW_AVX2 -#define HAS_I422TOARGBROW_AVX2 +#define HAS_COPYROW_AVX #define HAS_INTERPOLATEROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 @@ -213,18 +221,25 @@ extern "C" { #define HAS_UYVYTOUV422ROW_AVX2 #define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 +#define HAS_YTOARGBROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 +// The following require HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422TOARGBROW_AVX2) +#define HAS_YUY2TOARGBROW_AVX2 +#define HAS_UYVYTOARGBROW_AVX2 +#endif + // Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 -#endif // defined(VISUALC_HAS_AVX2) +#endif + // The following are Yasm x86 only: // TODO(fbarchard): Port AVX2 to inline. 
@@ -245,106 +260,14 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBBLENDROW_SSE2 #define HAS_ARGBATTENUATEROW_SSE2 +#define HAS_ARGBBLENDROW_SSE2 #define HAS_MIRRORROW_SSE2 #endif -// The following are available on arm64 platforms: -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -// #define HAS_I444TOARGBROW_NEON -// #define HAS_I422TOARGBROW_NEON -// #define HAS_I411TOARGBROW_NEON -// #define HAS_I422TOBGRAROW_NEON -// #define HAS_I422TOABGRROW_NEON -// #define HAS_I422TORGBAROW_NEON -// #define HAS_I422TORGB24ROW_NEON -// #define HAS_I422TORAWROW_NEON -// #define HAS_I422TORGB565ROW_NEON -// #define HAS_I422TOARGB1555ROW_NEON -// #define HAS_I422TOARGB4444ROW_NEON -// #define HAS_YTOARGBROW_NEON -// #define HAS_I400TOARGBROW_NEON -// #define HAS_NV12TOARGBROW_NEON -// #define HAS_NV21TOARGBROW_NEON -// #define HAS_NV12TORGB565ROW_NEON -// #define HAS_NV21TORGB565ROW_NEON -// #define HAS_YUY2TOARGBROW_NEON -// #define HAS_UYVYTOARGBROW_NEON -#define HAS_SPLITUVROW_NEON -#define HAS_MERGEUVROW_NEON -#define HAS_COPYROW_NEON -#define HAS_SETROW_NEON -#define HAS_ARGBSETROWS_NEON -#define HAS_MIRRORROW_NEON -#define HAS_MIRRORUVROW_NEON -#define HAS_ARGBMIRRORROW_NEON -#define HAS_RGB24TOARGBROW_NEON -#define HAS_RAWTOARGBROW_NEON -// #define HAS_RGB565TOARGBROW_NEON -// #define HAS_ARGB1555TOARGBROW_NEON -// #define HAS_ARGB4444TOARGBROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORAWROW_NEON -#define HAS_YUY2TOYROW_NEON -#define HAS_UYVYTOYROW_NEON -#define HAS_YUY2TOUV422ROW_NEON -#define HAS_UYVYTOUV422ROW_NEON -#define HAS_YUY2TOUVROW_NEON -#define HAS_UYVYTOUVROW_NEON -#define HAS_HALFROW_NEON -#define HAS_ARGBTOBAYERROW_NEON -#define HAS_ARGBTOBAYERGGROW_NEON -#define HAS_ARGBSHUFFLEROW_NEON -#define HAS_I422TOYUY2ROW_NEON -#define HAS_I422TOUYVYROW_NEON -// #define HAS_ARGBTORGB565ROW_NEON -// #define HAS_ARGBTOARGB1555ROW_NEON -// #define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTOYROW_NEON -#define HAS_ARGBTOYJROW_NEON -// #define HAS_ARGBTOUV444ROW_NEON -// #define HAS_ARGBTOUV422ROW_NEON -// #define HAS_ARGBTOUV411ROW_NEON -// #define HAS_ARGBTOUVROW_NEON -// #define HAS_ARGBTOUVJROW_NEON -// #define HAS_BGRATOUVROW_NEON -// #define HAS_ABGRTOUVROW_NEON -// #define HAS_RGBATOUVROW_NEON -// #define HAS_RGB24TOUVROW_NEON -// #define HAS_RAWTOUVROW_NEON -// #define HAS_RGB565TOUVROW_NEON -// #define HAS_ARGB1555TOUVROW_NEON -// #define HAS_ARGB4444TOUVROW_NEON -// #define HAS_RGB565TOYROW_NEON -// #define HAS_ARGB1555TOYROW_NEON -// #define HAS_ARGB4444TOYROW_NEON -// #define HAS_BGRATOYROW_NEON -// #define HAS_ABGRTOYROW_NEON -// #define HAS_RGBATOYROW_NEON -// #define HAS_RGB24TOYROW_NEON -// #define HAS_RAWTOYROW_NEON -// #define HAS_INTERPOLATEROW_NEON -// #define HAS_ARGBBLENDROW_NEON -// #define HAS_ARGBATTENUATEROW_NEON -// #define HAS_ARGBQUANTIZEROW_NEON -// #define HAS_ARGBSHADEROW_NEON -// #define HAS_ARGBGRAYROW_NEON -// #define HAS_ARGBSEPIAROW_NEON -// #define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBMULTIPLYROW_NEON -#define HAS_ARGBADDROW_NEON -#define HAS_ARGBSUBTRACTROW_NEON -#define HAS_SOBELROW_NEON -#define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXYROW_NEON -#define HAS_SOBELXROW_NEON -#define HAS_SOBELYROW_NEON -#endif - // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__aarch64__) || defined(__ARM_NEON__) || 
defined(LIBYUV_NEON)) #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON @@ -355,7 +278,6 @@ extern "C" { #define HAS_ARGB4444TOYROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTOBAYERGGROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON @@ -363,14 +285,13 @@ extern "C" { #define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV422ROW_NEON #define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOUVJROW_NEON -#define HAS_ARGBTOYROW_NEON +#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON -#define HAS_HALFROW_NEON #define HAS_I400TOARGBROW_NEON #define HAS_I411TOARGBROW_NEON #define HAS_I422TOABGRROW_NEON @@ -404,6 +325,7 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON +#define HAS_ARGBSETROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -426,25 +348,25 @@ extern "C" { #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_INTERPOLATEROW_NEON #define HAS_SOBELROW_NEON #define HAS_SOBELTOPLANEROW_NEON -#define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON -#define HAS_INTERPOLATEROW_NEON -// TODO(fbarchard): Investigate neon unittest failure. -// #define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON #endif // The following are available on Mips platforms: #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_I422TOABGRROW_MIPS_DSPR2 #define HAS_I422TOARGBROW_MIPS_DSPR2 #define HAS_I422TOBGRAROW_MIPS_DSPR2 -#define HAS_INTERPOLATEROWS_MIPS_DSPR2 +#define HAS_INTERPOLATEROW_MIPS_DSPR2 #define HAS_MIRRORROW_MIPS_DSPR2 #define HAS_MIRRORUVROW_MIPS_DSPR2 #define HAS_SPLITUVROW_MIPS_DSPR2 @@ -453,6 +375,7 @@ extern "C" { #if defined(_MSC_VER) && !defined(__CLR_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var +#define SIMD_ALIGNED32(var) __declspec(align(64)) var typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) int32 vec32[4]; typedef __declspec(align(16)) int8 vec8[16]; @@ -469,20 +392,34 @@ typedef __declspec(align(32)) uint8 ulvec8[32]; #elif defined(__GNUC__) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) typedef int16 __attribute__((vector_size(16))) vec16; typedef int32 __attribute__((vector_size(16))) vec32; typedef int8 __attribute__((vector_size(16))) vec8; typedef uint16 __attribute__((vector_size(16))) uvec16; typedef uint32 __attribute__((vector_size(16))) uvec32; typedef uint8 __attribute__((vector_size(16))) uvec8; +typedef int16 __attribute__((vector_size(32))) lvec16; +typedef int32 __attribute__((vector_size(32))) lvec32; +typedef int8 __attribute__((vector_size(32))) lvec8; +typedef uint16 __attribute__((vector_size(32))) ulvec16; +typedef uint32 __attribute__((vector_size(32))) ulvec32; +typedef uint8 __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var +#define SIMD_ALIGNED32(var) var typedef int16 vec16[8]; typedef int32 vec32[4]; typedef int8 vec8[16]; typedef uint16 uvec16[8]; typedef uint32 uvec32[4]; typedef uint8 uvec8[16]; +typedef int16 lvec16[16]; +typedef int32 lvec32[8]; +typedef int8 lvec8[32]; +typedef uint16 ulvec16[16]; +typedef uint32 ulvec32[8]; +typedef uint8 ulvec8[32]; #endif #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) @@ -492,24 +429,16 @@ typedef uint8 uvec8[16]; #endif // NaCL macros for GCC x86 and x64. - -// TODO(nfullagar): When pepper_33 toolchain is distributed, default to -// NEW_BINUTILS and remove all BUNDLEALIGN occurances. #if defined(__native_client__) #define LABELALIGN ".p2align 5\n" #else -#define LABELALIGN ".p2align 2\n" +#define LABELALIGN #endif #if defined(__native_client__) && defined(__x86_64__) -#if defined(NEW_BINUTILS) +// r14 is used for MEMOP macros. +#define NACL_R14 "r14", #define BUNDLELOCK ".bundle_lock\n" #define BUNDLEUNLOCK ".bundle_unlock\n" -#define BUNDLEALIGN "\n" -#else -#define BUNDLELOCK "\n" -#define BUNDLEUNLOCK "\n" -#define BUNDLEALIGN ".p2align 5\n" -#endif #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" #define MEMLEA(offset, base) #offset "(%q" #base ")" @@ -534,8 +463,19 @@ typedef uint8 uvec8[16]; "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ #opcode " (%%r15,%%r14),%" #arg "\n" \ BUNDLEUNLOCK +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ + BUNDLEUNLOCK +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK #else // defined(__native_client__) && defined(__x86_64__) -#define BUNDLEALIGN "\n" +#define NACL_R14 +#define BUNDLEALIGN #define MEMACCESS(base) "(%" #base ")" #define MEMACCESS2(offset, base) #offset "(%" #base ")" #define MEMLEA(offset, base) #offset "(%" #base ")" @@ -551,14 +491,19 @@ typedef uint8 uvec8[16]; #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" #define MEMOPARG(opcode, offset, base, index, scale, arg) \ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ + #reg2 "\n" +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" #endif // 
defined(__native_client__) && defined(__x86_64__) #if defined(__arm__) || defined(__aarch64__) #undef MEMACCESS #if defined(__native_client__) -#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" #else -#define MEMACCESS(base) "\n" +#define MEMACCESS(base) #endif #endif @@ -651,13 +596,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); -void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); -void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, @@ -736,16 +674,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, @@ -807,15 +735,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUV422Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); @@ -825,6 +749,8 @@ void ARGBToUV422Row_C(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUV411Row_C(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); @@ -832,6 +758,10 @@ void 
MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); @@ -843,9 +773,12 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); @@ -853,10 +786,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); -void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, - uint8* dst_v, int pix); void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, @@ -874,8 +803,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width); void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, @@ -884,11 +811,14 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_ERMS(const uint8* src, uint8* dst, int count); -void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_MIPS(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count); +void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); +void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); void CopyRow_16_C(const uint16* src, uint16* dst, int count); @@ -900,15 +830,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void SetRow_X86(uint8* 
dst, uint32 v32, int count); -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height); -void SetRow_NEON(uint8* dst, uint32 v32, int count); -void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, - int dst_stride, int height); -void SetRow_C(uint8* dst, uint32 v32, int count); -void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, - int height); +void SetRow_C(uint8* dst, uint8 v8, int count); +void SetRow_X86(uint8* dst, uint8 v8, int count); +void SetRow_ERMS(uint8* dst, uint8 v8, int count); +void SetRow_NEON(uint8* dst, uint8 v8, int count); +void SetRow_Any_X86(uint8* dst, uint8 v8, int count); +void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); // ARGBShufflers for BGRAToARGB etc. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, @@ -921,8 +853,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix); void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, @@ -975,6 +905,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -988,8 +922,10 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix); + void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); @@ -1032,6 +968,11 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2, void UYVYToARGBRow_C(const uint8* src_uyvy, uint8* dst_argb, int width); +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToBGRARow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1072,14 +1013,26 @@ void I422ToRGB565Row_C(const uint8* src_y, const uint8* src_v, uint8* dst_rgb565, int width); -void YToARGBRow_C(const uint8* src_y, - uint8* dst_argb, - int width); void I422ToARGBRow_AVX2(const uint8* 
src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToBGRARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I444ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1103,6 +1056,14 @@ void NV21ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1111,12 +1072,31 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width); void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width); +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToBGRARow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1137,17 +1117,31 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); -// RGB24/RAW are unaligned. 
+void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToRGB24Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1158,56 +1152,26 @@ void I422ToRAWRow_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_raw, int width); - -void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int width); -void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - int width); -void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - int width); -void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width); -void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width); -void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_bgra, - int width); -void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_abgr, - int width); -void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - int width); void I422ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToBGRARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I444ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1231,6 +1195,14 @@ void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1239,12 +1211,26 @@ void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width); void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width); +void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); void I422ToBGRARow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1265,17 +1251,31 @@ void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_rgba, int width); +void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); 
void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, int width); +void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, int width); -// RGB24/RAW are unaligned. +void I422ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1286,15 +1286,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); + +void YToARGBRow_C(const uint8* src_y, + uint8* dst_argb, + int width); void YToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void YToARGBRow_AVX2(const uint8* src_y, + uint8* dst_argb, + int width); void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); void YToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void YToARGBRow_Any_AVX2(const uint8* src_y, + uint8* dst_argb, + int width); void YToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); @@ -1365,6 +1375,10 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -1489,12 +1503,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix); -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix); -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix); void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); @@ -1530,12 +1538,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); void UYVYToUV422Row_SSE2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix); -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix); -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix); void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); @@ -1568,28 +1570,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); -void HalfRow_C(const uint8* src_uv, 
int src_uv_stride, - uint8* dst_uv, int pix); -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix); -void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix); -void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix); - -void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, - uint16* dst_uv, int pix); - -void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix); -void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix); -void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix); -void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix); -void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix); void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, uint32 /* selector */, int pix); void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, @@ -1736,15 +1716,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1757,9 +1731,9 @@ void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); +void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, ptrdiff_t src_stride_ptr, diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h index a3bc07e0f..102158d1a 100644 --- a/third_party/libyuv/include/libyuv/scale.h +++ b/third_party/libyuv/include/libyuv/scale.h @@ -34,6 +34,7 @@ void ScalePlane(const uint8* src, int src_stride, int dst_width, int dst_height, enum FilterMode filtering); +LIBYUV_API void ScalePlane_16(const uint16* src, int src_stride, int src_width, int src_height, uint16* dst, int dst_stride, diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 3c495424f..27aa04b22 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -44,21 +44,13 @@ extern "C" { // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define 
HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWN2_NEON -#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__aarch64__) || defined(LIBYUV_NEON)) -/* #define HAS_SCALEROWDOWN2_NEON */ -/* #define HAS_SCALEROWDOWN4_NEON */ -/* #define HAS_SCALEROWDOWN34_NEON */ -/* #define HAS_SCALEROWDOWN38_NEON */ -/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */ -/* #define HAS_SCALEARGBROWDOWN2_NEON */ #endif // The following are available on Mips platforms: @@ -208,15 +200,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -267,10 +250,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); // Row functions. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index 73a7f1b01..9236a7fa1 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1059 +#define LIBYUV_VERSION 1305 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h index 91acc2ffc..cb6582f24 100644 --- a/third_party/libyuv/include/libyuv/video_common.h +++ b/third_party/libyuv/include/libyuv/video_common.h @@ -62,7 +62,7 @@ enum FourCC { // 2 Secondary YUV formats: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), @@ -75,7 +75,7 @@ enum FourCC { FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - // 4 Secondary RGB formats: 4 Bayer Patterns. + // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated. 
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc index dc715e019..f84a08ee6 100644 --- a/third_party/libyuv/source/compare.cc +++ b/third_party/libyuv/source/compare.cc @@ -19,6 +19,7 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/row.h" +#include "libyuv/video_common.h" #ifdef __cplusplus namespace libyuv { @@ -78,6 +79,54 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { return seed; } +static uint32 ARGBDetectRow_C(const uint8* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { + uint32 fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -114,8 +163,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } #endif #if defined(HAS_SUMSQUAREERROR_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { // Note only used for multiples of 16 so count is not checked. 
SumSquareError = SumSquareError_SSE2; } diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc index 55052c0ee..ef006ec41 100644 --- a/third_party/libyuv/source/compare_neon.cc +++ b/third_party/libyuv/source/compare_neon.cc @@ -16,7 +16,8 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { volatile uint32 sse; @@ -56,46 +57,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { return sse; } -#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - ".p2align 2 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %2, %2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "bgt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} - -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/compare_neon64.cc b/third_party/libyuv/source/compare_neon64.cc new file mode 100644 index 000000000..cc078f84c --- /dev/null +++ b/third_party/libyuv/source/compare_neon64.cc @@ -0,0 +1,63 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/compare_posix.cc b/third_party/libyuv/source/compare_posix.cc index ac361190e..247cb33bb 100644 --- a/third_party/libyuv/source/compare_posix.cc +++ b/third_party/libyuv/source/compare_posix.cc @@ -25,11 +25,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "lea " MEMLEA(0x10, 1) ",%1 \n" - "sub $0x10,%2 \n" "movdqa %%xmm1,%%xmm3 \n" "psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm3,%%xmm2 \n" @@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { "pmaddwd %%xmm2,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" "jg 1b \n" "pshufd $0xee,%%xmm0,%%xmm1 \n" @@ -53,11 +53,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { "+r"(src_b), // %1 "+r"(count), // %2 "=g"(sse) // %3 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); // NOLINT return sse; } @@ -124,13 +120,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { "pmulld %%xmm5,%%xmm1 \n" "paddd %%xmm4,%%xmm3 \n" "paddd %%xmm2,%%xmm1 \n" - "sub $0x10,%1 \n" "paddd %%xmm3,%%xmm1 \n" "pshufd $0xe,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "pshufd $0x1,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" "jg 1b \n" "movd %%xmm0,%3 \n" : "+r"(src), // %0 @@ -143,9 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { "m"(kHashMul2), // %7 "m"(kHashMul3) // %8 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif ); // NOLINT return hash; } diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc index 99831651f..e99009a21 100644 --- a/third_party/libyuv/source/compare_win.cc +++ b/third_party/libyuv/source/compare_win.cc @@ -27,13 +27,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { pxor xmm0, xmm0 pxor 
xmm5, xmm5 - align 4 wloop: - movdqa xmm1, [eax] + movdqu xmm1, [eax] lea eax, [eax + 16] - movdqa xmm2, [edx] + movdqu xmm2, [edx] lea edx, [edx + 16] - sub ecx, 16 movdqa xmm3, xmm1 // abs trick psubusb xmm1, xmm2 psubusb xmm2, xmm3 @@ -45,6 +43,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { pmaddwd xmm2, xmm2 paddd xmm0, xmm1 paddd xmm0, xmm2 + sub ecx, 16 jg wloop pshufd xmm1, xmm0, 0xee @@ -70,12 +69,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax - align 4 wloop: vmovdqu ymm1, [eax] vmovdqu ymm2, [eax + edx] lea eax, [eax + 32] - sub ecx, 32 vpsubusb ymm3, ymm1, ymm2 // abs difference trick vpsubusb ymm2, ymm2, ymm1 vpor ymm1, ymm2, ymm3 @@ -85,6 +82,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { vpmaddwd ymm1, ymm1, ymm1 vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 jg wloop vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. @@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, kHash16x33 - align 4 wloop: movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] @@ -170,7 +167,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { pmulld(0xcd) // pmulld xmm1, xmm5 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 - sub ecx, 16 paddd xmm1, xmm3 pshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -178,6 +174,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 + sub ecx, 16 jg wloop movd eax, xmm0 // return hash @@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { movd xmm0, [esp + 12] // seed movdqa xmm6, kHash16x33 - align 4 wloop: vpmovzxbd xmm3, dword ptr [eax] // src[0-3] pmulld xmm0, xmm6 // hash *= 33 ^ 16 @@ -209,13 +205,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { pmulld xmm1, kHashMul3 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 - sub ecx, 16 paddd xmm1, xmm3 pshufd xmm2, xmm1, 0x0e // upper 2 dwords paddd xmm1, xmm2 pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 + sub ecx, 16 jg wloop movd eax, xmm0 // return hash diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index a8e294f47..41696c18f 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -188,17 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, int width, int height) { int y; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src, 16) && - IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - CopyRow = CopyRow_SSE2; +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? 
CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -207,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, } #endif #if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MIPS) @@ -283,20 +280,15 @@ static int X420ToI420(const uint8* src_y, src_stride_uv = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_SPLITUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - SplitUVRow = SplitUVRow_SSE2; - } + SplitUVRow = SplitUVRow_SSE2; } } #endif #if defined(HAS_SPLITUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { SplitUVRow = SplitUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { SplitUVRow = SplitUVRow_AVX2; @@ -304,7 +296,7 @@ static int X420ToI420(const uint8* src_y, } #endif #if defined(HAS_SPLITUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow = SplitUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { SplitUVRow = SplitUVRow_NEON; @@ -312,15 +304,13 @@ static int X420ToI420(const uint8* src_y, } #endif #if defined(HAS_SPLITUVROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; if (IS_ALIGNED(halfwidth, 16)) { - SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2; - if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_MIPS_DSPR2; - } + SplitUVRow = SplitUVRow_MIPS_DSPR2; } } #endif @@ -391,125 +381,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420, width, height); } -// Convert Q420 to I420. -// Format is rows of YY/YUYV -LIBYUV_API -int Q420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; - int halfheight; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int pix) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = - YUY2ToYRow_C; - if (!src_y || !src_yuy2 || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - // CopyRow for rows of just Y in Q420 copied to Y plane of I420. 
-#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - CopyRow = CopyRow_SSE2; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif - -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } - } - } -#endif -#if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; - YUY2ToYRow = YUY2ToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - YUY2ToUV422Row = YUY2ToUV422Row_AVX2; - YUY2ToYRow = YUY2ToYRow_AVX2; - } - } -#endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } - if (IS_ALIGNED(width, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - YUY2ToUV422Row = YUY2ToUV422Row_NEON; - } - } -#endif - - for (y = 0; y < height - 1; y += 2) { - CopyRow(src_y, dst_y, width); - src_y += src_stride_y; - dst_y += dst_stride_y; - - YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); - YUY2ToYRow(src_yuy2, dst_y, width); - src_yuy2 += src_stride_yuy2; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - CopyRow(src_y, dst_y, width); - YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); - } - return 0; -} - // Convert YUY2 to I420. 
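
With Q420ToI420 removed (and FOURCC_Q420 marked deprecated in video_common.h above), packed 4:2:2 sources go through YUY2ToI420 instead. A hedged caller sketch, assuming `out_i420` holds width*height + 2*((width+1)/2)*((height+1)/2) bytes and all names are illustrative:

#include "libyuv/convert.h"  // YUY2ToI420

// Convert one packed YUY2 frame into a contiguous I420 buffer.
int ConvertYuy2Frame(const uint8* yuy2, int yuy2_stride,
                     int width, int height, uint8* out_i420) {
  int half_w = (width + 1) / 2;
  int half_h = (height + 1) / 2;
  uint8* dst_y = out_i420;
  uint8* dst_u = dst_y + width * height;
  uint8* dst_v = dst_u + half_w * half_h;
  return libyuv::YUY2ToI420(yuy2, yuy2_stride,
                            dst_y, width,
                            dst_u, half_w,
                            dst_v, half_w,
                            width, height);
}

The YUY2ToI420 declaration and its updated dispatch follow.
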
LIBYUV_API int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, @@ -529,23 +400,17 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } #if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUVRow = YUY2ToUVRow_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif #if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -555,11 +420,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } #endif #if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUVRow = YUY2ToUVRow_Any_NEON; - } + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUVRow = YUY2ToUVRow_NEON; @@ -602,23 +465,17 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } #if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { UYVYToUVRow = UYVYToUVRow_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUVRow = UYVYToUVRow_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; - } - } + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; } } #endif #if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { UYVYToUVRow = UYVYToUVRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -628,11 +485,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } #endif #if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; - if (width >= 16) { - UYVYToUVRow = UYVYToUVRow_Any_NEON; - } + UYVYToUVRow = UYVYToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUVRow = UYVYToUVRow_NEON; @@ -680,23 +535,17 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) && 
defined(HAS_ARGBTOUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -706,7 +555,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -714,7 +563,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; @@ -761,34 +610,31 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } -#if defined(HAS_BGRATOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { BGRAToUVRow = BGRAToUVRow_Any_SSSE3; BGRAToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3; - BGRAToYRow = BGRAToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) { - BGRAToUVRow = BGRAToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - BGRAToYRow = BGRAToYRow_SSSE3; - } - } + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; } } -#elif defined(HAS_BGRATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_NEON; } - if (width >= 16) { + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { BGRAToUVRow = BGRAToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_NEON; } } - } #endif for (y = 0; y < height - 1; y += 2) { @@ -830,32 +676,29 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } -#if defined(HAS_ABGRTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToUVRow = ABGRToUVRow_Any_SSSE3; ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3; - ABGRToYRow = ABGRToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) { - ABGRToUVRow = ABGRToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ABGRToYRow = ABGRToYRow_SSSE3; - } - } + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; } } -#elif defined(HAS_ABGRTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_NEON; } - if (width >= 16) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; } } #endif 
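
The hunks above and below all make the same change: the `width >= N` guards and 16-byte alignment checks are dropped, the `*_Any_*` row variant becomes the default, and an exact-multiple width upgrades to the full-width kernel (the separate `*_Unaligned_*` kernels are deleted, and the SSE2/SSSE3 rows now issue unaligned loads). In caller terms, the SIMD paths are reached without over-aligning the planes; a sketch with illustrative names (ARGBToI420 is declared in convert.h):

#include "libyuv/convert.h"  // ARGBToI420

// Convert an ARGB frame whose pointers and strides carry no particular
// alignment; the Any_ rows handle the ragged tail widths.
int ConvertArbitraryArgb(const uint8* argb, int argb_stride,
                         int width, int height,
                         uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  int half_w = (width + 1) / 2;
  return libyuv::ARGBToI420(argb, argb_stride,
                            dst_y, width,
                            dst_u, half_w,
                            dst_v, half_w,
                            width, height);
}
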
@@ -899,32 +742,29 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, src_rgba = src_rgba + (height - 1) * src_stride_rgba; src_stride_rgba = -src_stride_rgba; } -#if defined(HAS_RGBATOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { +#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { RGBAToUVRow = RGBAToUVRow_Any_SSSE3; RGBAToYRow = RGBAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3; - RGBAToYRow = RGBAToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) { - RGBAToUVRow = RGBAToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - RGBAToYRow = RGBAToYRow_SSSE3; - } - } + RGBAToUVRow = RGBAToUVRow_SSSE3; + RGBAToYRow = RGBAToYRow_SSSE3; } } -#elif defined(HAS_RGBATOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { RGBAToYRow = RGBAToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGBAToYRow = RGBAToYRow_NEON; } - if (width >= 16) { - RGBAToUVRow = RGBAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_NEON; - } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; } } #endif @@ -978,22 +818,23 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } #if defined(HAS_RGB24TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { RGB24ToYRow = RGB24ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_NEON; } - if (width >= 16) { - RGB24ToUVRow = RGB24ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } + } +#endif +#if defined(HAS_RGB24TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; } } -#else // HAS_RGB24TOYROW_NEON - +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; @@ -1001,7 +842,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; @@ -1009,17 +850,13 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_RGB24TOYROW_NEON { #if !defined(HAS_RGB24TOYROW_NEON) @@ -1095,22 +932,23 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } #if defined(HAS_RAWTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { RAWToYRow = RAWToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_NEON; } - if (width >= 16) { - RAWToUVRow = RAWToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - 
RAWToUVRow = RAWToUVRow_NEON; - } + } +#endif +#if defined(HAS_RAWTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; } } -#else // HAS_RAWTOYROW_NEON - +#endif #if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_SSSE3; @@ -1118,7 +956,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; @@ -1126,17 +964,13 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_RAWTOYROW_NEON { // Allocate 2 rows of ARGB. @@ -1210,22 +1044,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } #if defined(HAS_RGB565TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { RGB565ToYRow = RGB565ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB565ToYRow = RGB565ToYRow_NEON; } - if (width >= 16) { - RGB565ToUVRow = RGB565ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_NEON; - } + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; } } #else // HAS_RGB565TOYROW_NEON #if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_SSE2; @@ -1233,7 +1065,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; @@ -1241,13 +1073,10 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif // HAS_ARGBTOUVROW_SSSE3 @@ -1327,22 +1156,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } #if defined(HAS_ARGB1555TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToYRow = ARGB1555ToYRow_NEON; } - if (width >= 16) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; - } + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; } } 
#else // HAS_ARGB1555TOYROW_NEON #if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; @@ -1350,7 +1177,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; @@ -1358,13 +1185,10 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif // HAS_ARGBTOUVROW_SSSE3 @@ -1445,22 +1269,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #if defined(HAS_ARGB4444TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToYRow = ARGB4444ToYRow_NEON; } - if (width >= 16) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; - } + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; } } #else // HAS_ARGB4444TOYROW_NEON #if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; @@ -1468,7 +1290,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; @@ -1476,13 +1298,10 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif // HAS_ARGBTOUVROW_SSSE3 diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc index ac0bc3d15..66f766079 100644 --- a/third_party/libyuv/source/convert_argb.cc +++ b/third_party/libyuv/source/convert_argb.cc @@ -11,7 +11,6 @@ #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" -#include "libyuv/format_conversion.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif @@ -79,17 +78,15 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } #if defined(HAS_I444TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = 
I444ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; - } + I444ToARGBRow = I444ToARGBRow_SSSE3; } } -#elif defined(HAS_I444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I444ToARGBRow = I444ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I444ToARGBRow = I444ToARGBRow_NEON; @@ -141,18 +138,15 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } #if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; @@ -160,7 +154,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; @@ -221,17 +215,15 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } #if defined(HAS_I411TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I411ToARGBRow = I411ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I411ToARGBRow = I411ToARGBRow_SSSE3; - } + I411ToARGBRow = I411ToARGBRow_SSSE3; } } -#elif defined(HAS_I411TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I411ToARGBRow = I411ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I411ToARGBRow = I411ToARGBRow_NEON; @@ -276,15 +268,23 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_YTOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { YToARGBRow = YToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { YToARGBRow = YToARGBRow_SSE2; } } -#elif defined(HAS_YTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_YTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YToARGBRow = YToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + YToARGBRow = YToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { YToARGBRow = YToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { YToARGBRow = YToARGBRow_NEON; @@ -326,17 +326,15 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_I400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + if (TestCpuFlag(kCpuHasSSE2)) { I400ToARGBRow = I400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_argb, 16) && 
IS_ALIGNED(dst_stride_argb, 16)) { - I400ToARGBRow = I400ToARGBRow_SSE2; - } + I400ToARGBRow = I400ToARGBRow_SSE2; } } -#elif defined(HAS_I400TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I400ToARGBRow = I400ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I400ToARGBRow = I400ToARGBRow_NEON; @@ -447,15 +445,15 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = dst_stride_argb = 0; } #if defined(HAS_RGB24TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } -#elif defined(HAS_RGB24TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToARGBRow = RGB24ToARGBRow_NEON; @@ -497,15 +495,15 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_stride_raw = dst_stride_argb = 0; } #if defined(HAS_RAWTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_SSSE3; } } -#elif defined(HAS_RAWTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToARGBRow = RAWToARGBRow_NEON; @@ -547,15 +545,15 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = dst_stride_argb = 0; } #if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } -#elif defined(HAS_RGB565TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_NEON; @@ -597,15 +595,15 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = dst_stride_argb = 0; } #if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } -#elif defined(HAS_ARGB1555TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; @@ -647,15 +645,15 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = dst_stride_argb = 0; } #if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { 
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } -#elif defined(HAS_ARGB4444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; @@ -693,17 +691,23 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } #if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } + NV12ToARGBRow = NV12ToARGBRow_SSSE3; } } -#elif defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { NV12ToARGBRow = NV12ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_NEON; @@ -744,18 +748,23 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } #if defined(HAS_NV21TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; - } + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_AVX2; } } #endif #if defined(HAS_NV21TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { NV21ToARGBRow = NV21ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { NV21ToARGBRow = NV21ToARGBRow_NEON; @@ -795,17 +804,23 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, dst_stride_argb = -dst_stride_argb; } #if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; - } + NV12ToARGBRow = NV12ToARGBRow_SSSE3; } } -#elif defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { NV12ToARGBRow = NV12ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_NEON; @@ -852,19 +867,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = dst_stride_argb = 0; } #if defined(HAS_YUY2TOARGBROW_SSSE3) - // Posix is 16, Windows is 8. 
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; - } + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; } } -#elif defined(HAS_YUY2TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { YUY2ToARGBRow = YUY2ToARGBRow_NEON; @@ -905,19 +924,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = dst_stride_argb = 0; } #if defined(HAS_UYVYTOARGBROW_SSSE3) - // Posix is 16, Windows is 8. - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - UYVYToARGBRow = UYVYToARGBRow_SSSE3; - } + UYVYToARGBRow = UYVYToARGBRow_SSSE3; } } -#elif defined(HAS_UYVYTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { UYVYToARGBRow = UYVYToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { UYVYToARGBRow = UYVYToARGBRow_NEON; @@ -932,6 +955,152 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, return 0; } +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
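
J420 is I420 carrying full-range (JPEG) YUV, and, as elsewhere in libyuv, a negative height requests a vertical flip during conversion; the `if (height < 0)` block that follows implements the flip by pointing dst_argb at the last row and negating the stride. A hedged caller sketch (names are illustrative; `argb_stride` must be at least width * 4):

#include "libyuv/convert_argb.h"  // J420ToARGB

// Convert full-range 4:2:0 planes to ARGB, flipped vertically
// (height is assumed positive here).
int Jpeg420ToArgbFlipped(const uint8* y, int y_stride,
                         const uint8* u, int u_stride,
                         const uint8* v, int v_stride,
                         uint8* argb, int argb_stride,
                         int width, int height) {
  return libyuv::J420ToARGB(y, y_stride, u, u_stride, v, v_stride,
                            argb, argb_stride, width, -height);
}
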
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc index c1a2f62f0..b743cde26 100644 --- a/third_party/libyuv/source/convert_from.cc +++ b/third_party/libyuv/source/convert_from.cc @@ -13,7 +13,6 @@ #include "libyuv/basic_types.h" #include "libyuv/convert.h" // For I420Copy #include "libyuv/cpu_id.h" -#include "libyuv/format_conversion.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/scale.h" // For ScalePlane() @@ -174,14 +173,15 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } #if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } -#elif defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; @@ -220,14 +220,15 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, dst_stride_yuy2 = -dst_stride_yuy2; } #if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } -#elif defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; @@ -280,14 +281,15 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } #if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = 
I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } -#elif defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; @@ -326,14 +328,15 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, dst_stride_uyvy = -dst_stride_uyvy; } #if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } -#elif defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; @@ -397,20 +400,15 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, src_stride_u = src_stride_v = dst_stride_uv = 0; } #if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_Unaligned_SSE2; - if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } + MergeUVRow_ = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow_ = MergeUVRow_AVX2; @@ -418,7 +416,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_NEON; @@ -476,18 +474,15 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; @@ -495,7 +490,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; @@ -548,23 +543,30 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, dst_stride_bgra = -dst_stride_bgra; } #if defined(HAS_I422TOBGRAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToBGRARow = I422ToBGRARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_bgra, 16) && 
IS_ALIGNED(dst_stride_bgra, 16)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; - } + I422ToBGRARow = I422ToBGRARow_SSSE3; } } -#elif defined(HAS_I422TOBGRAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToBGRARow = I422ToBGRARow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToBGRARow = I422ToBGRARow_NEON; } } -#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && @@ -610,17 +612,23 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, dst_stride_abgr = -dst_stride_abgr; } #if defined(HAS_I422TOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } + I422ToABGRRow = I422ToABGRRow_SSSE3; } } -#elif defined(HAS_I422TOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToABGRRow = I422ToABGRRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToABGRRow = I422ToABGRRow_NEON; @@ -664,17 +672,23 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, dst_stride_rgba = -dst_stride_rgba; } #if defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } + I422ToRGBARow = I422ToRGBARow_SSSE3; } } -#elif defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToRGBARow = I422ToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_NEON; @@ -718,14 +732,15 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, dst_stride_rgb24 = -dst_stride_rgb24; } #if defined(HAS_I422TORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } -#elif defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB24Row = I422ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGB24Row = I422ToRGB24Row_NEON; @@ -769,14 +784,15 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, dst_stride_raw = -dst_stride_raw; } #if 
defined(HAS_I422TORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRAWRow = I422ToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRAWRow = I422ToRAWRow_SSSE3; } } -#elif defined(HAS_I422TORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToRAWRow = I422ToRAWRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRAWRow = I422ToRAWRow_NEON; @@ -820,14 +836,23 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, dst_stride_argb1555 = -dst_stride_argb1555; } #if defined(HAS_I422TOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; } } -#elif defined(HAS_I422TOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGB1555Row = I422ToARGB1555Row_NEON; @@ -872,14 +897,23 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, dst_stride_argb4444 = -dst_stride_argb4444; } #if defined(HAS_I422TOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; } } -#elif defined(HAS_I422TOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGB4444Row = I422ToARGB4444Row_NEON; @@ -923,14 +957,23 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_I422TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } -#elif defined(HAS_I422TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB565Row = I422ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_NEON; @@ -1054,38 +1097,6 @@ int ConvertFromI420(const uint8* y, int y_stride, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; - case FOURCC_BGGR: - r = I420ToBayerBGGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); - break; - case FOURCC_GBRG: - r = I420ToBayerGBRG(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? 
dst_sample_stride : width, - width, height); - break; - case FOURCC_GRBG: - r = I420ToBayerGRBG(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); - break; - case FOURCC_RGGB: - r = I420ToBayerRGGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); - break; case FOURCC_I400: r = I400Copy(y, y_stride, dst_sample, @@ -1116,7 +1127,7 @@ int ConvertFromI420(const uint8* y, int y_stride, width, height); break; } - // TODO(fbarchard): Add M420 and Q420. + // TODO(fbarchard): Add M420. // Triplanar formats // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index de461ddb0..dc2186a6a 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -12,7 +12,6 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" -#include "libyuv/format_conversion.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" @@ -51,17 +50,15 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } + ARGBToUV444Row = ARGBToUV444Row_SSSE3; } } -#elif defined(HAS_ARGBTOUV444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444Row = ARGBToUV444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToUV444Row = ARGBToUV444Row_NEON; @@ -69,19 +66,16 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -130,17 +124,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } + ARGBToUV422Row = ARGBToUV422Row_SSSE3; } } -#elif defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV422Row = ARGBToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUV422Row = 
ARGBToUV422Row_NEON; @@ -149,18 +141,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, #endif #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -209,19 +198,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; @@ -229,7 +214,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -237,7 +222,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOUV411ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 32) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV411Row = ARGBToUV411Row_Any_NEON; if (IS_ALIGNED(width, 32)) { ARGBToUV411Row = ARGBToUV411Row_NEON; @@ -281,22 +266,17 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -304,7 +284,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; @@ -312,18 +292,15 @@ int ARGBToNV12(const uint8* src_argb, int 
src_stride_argb, } #endif #if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } + MergeUVRow_ = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow_ = MergeUVRow_AVX2; @@ -331,7 +308,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_NEON; @@ -388,22 +365,17 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -411,7 +383,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; @@ -419,18 +391,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_MERGEUVROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { - MergeUVRow_ = MergeUVRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) { - MergeUVRow_ = MergeUVRow_SSE2; - } + MergeUVRow_ = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow_ = MergeUVRow_AVX2; @@ -438,7 +407,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_MERGEUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_NEON; @@ -500,17 +469,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_yuy2 = 0; } #if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = 
ARGBToUV422Row_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } + ARGBToUV422Row = ARGBToUV422Row_SSSE3; } } -#elif defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV422Row = ARGBToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUV422Row = ARGBToUV422Row_NEON; @@ -518,17 +485,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -537,14 +502,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, #endif #if defined(HAS_I422TOYUY2ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } -#elif defined(HAS_I422TOYUY2ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; @@ -602,17 +568,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_uyvy = 0; } #if defined(HAS_ARGBTOUV422ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUV422Row = ARGBToUV422Row_SSSE3; - } + ARGBToUV422Row = ARGBToUV422Row_SSSE3; } } -#elif defined(HAS_ARGBTOUV422ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV422Row = ARGBToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUV422Row = ARGBToUV422Row_NEON; @@ -620,17 +584,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -639,14 +601,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, #endif #if defined(HAS_I422TOUYVYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 
16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } -#elif defined(HAS_I422TOUYVYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; @@ -697,19 +660,15 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_y = 0; } #if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; @@ -717,7 +676,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; @@ -773,14 +732,15 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_rgb24 = 0; } #if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; } } -#elif defined(HAS_ARGBTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRGB24Row = ARGBToRGB24Row_NEON; @@ -820,14 +780,15 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_raw = 0; } #if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; } } -#elif defined(HAS_ARGBTORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRAWRow = ARGBToRAWRow_NEON; @@ -843,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, return 0; } +static const uint8 kDither8x8[64] = { + 0, 128, 32, 160, 8, 136, 40, 168, + 192, 64, 224, 96, 200, 72, 232, 104, + 48, 176, 16, 144, 56, 184, 24, 152, + 240, 112, 208, 80, 248, 120, 216, 88, + 12, 140, 44, 172, 4, 132, 36, 164, + 204, 76, 236, 108, 196, 68, 228, 100, + 60, 188, 28, 156, 52, 180, 20, 148, + 252, 124, 220, 92, 244, 116, 212, 84, +}; + +// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). 
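// The 64-byte table above is a standard 8x8 ordered-dither (Bayer) threshold
// matrix with values in 0..255; the conversion loop hands each row kernel the
// 8-byte slice dither8x8 + ((y & 7) << 3).  Below is a hedged usage sketch for
// the ARGBToRGB565Dither() entry point declared just after this comment:
// passing NULL selects the built-in kDither8x8.  The helper name and the
// tightly packed strides are illustrative assumptions, not libyuv APIs.
#include "libyuv/convert_from_argb.h"

static int ConvertArgbTo565WithDither(const uint8* argb, int width, int height,
                                      uint8* rgb565) {
  // 4 bytes per ARGB pixel in, 2 bytes per RGB565 pixel out, no row padding.
  return ARGBToRGB565Dither(argb, width * 4,
                            rgb565, width * 2,
                            NULL,  // NULL selects the default kDither8x8 table
                            width, height);
}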
+LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither8x8, int width, int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither8x8) { + dither8x8 = kDither8x8; + + } + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + dither8x8 + ((y & 7) << 3), width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + // Convert ARGB To RGB565. LIBYUV_API int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, @@ -867,15 +868,23 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_rgb565 = 0; } #if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } } -#elif defined(HAS_ARGBTORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRGB565Row = ARGBToRGB565Row_NEON; @@ -915,15 +924,23 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb1555 = 0; } #if defined(HAS_ARGBTOARGB1555ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } } -#elif defined(HAS_ARGBTOARGB1555ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; @@ -963,15 +980,23 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb4444 = 0; } #if defined(HAS_ARGBTOARGB4444ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } } -#elif defined(HAS_ARGBTOARGB4444ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if 
defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; @@ -1011,23 +1036,17 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3; - ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVJRow = ARGBToUVJRow_SSSE3; - if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } - } + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYJRow = ARGBToYJRow_AVX2; @@ -1035,7 +1054,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_NEON; @@ -1043,7 +1062,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJRow = ARGBToUVJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVJRow = ARGBToUVJRow_NEON; @@ -1067,6 +1086,80 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, return 0; } +// ARGB little endian (bgra in memory) to J422 +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUVJ422Row_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
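// Coalescing, sketched generically below: when every plane is stored with no
// row padding (stride equals width times bytes per pixel, or half of width for
// the half-width chroma planes checked next), the whole image can be treated
// as one long row of width * height pixels, so the row kernels run once
// instead of once per scanline.  A minimal self-contained sketch of the idea;
// CopyPacked() is illustrative, not a libyuv function.
#include <stdint.h>
#include <string.h>

static void CopyPacked(const uint8_t* src, int src_stride,
                       uint8_t* dst, int dst_stride,
                       int width, int height, int bpp) {
  int y;
  if (src_stride == width * bpp && dst_stride == width * bpp) {
    width *= height;              // one contiguous "row" covers the buffer
    height = 1;
    src_stride = dst_stride = 0;  // stride is never advanced past it
  }
  for (y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width * bpp);
    src += src_stride;
    dst += dst_stride;
  }
}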
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUVJ422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJ422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJ422Row(src_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + // Convert ARGB to J400. LIBYUV_API int ARGBToJ400(const uint8* src_argb, int src_stride_argb, @@ -1091,19 +1184,15 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_yj = 0; } #if defined(HAS_ARGBTOYJROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { - ARGBToYJRow = ARGBToYJRow_SSSE3; - } + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYJRow = ARGBToYJRow_AVX2; @@ -1111,7 +1200,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_NEON; diff --git a/third_party/libyuv/source/convert_to_argb.cc b/third_party/libyuv/source/convert_to_argb.cc index 1b228a7b4..af829fbd3 100644 --- a/third_party/libyuv/source/convert_to_argb.cc +++ b/third_party/libyuv/source/convert_to_argb.cc @@ -11,7 +11,6 @@ #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" -#include "libyuv/format_conversion.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif @@ -144,36 +143,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, crop_argb, argb_stride, crop_width, inv_crop_height); break; - // TODO(fbarchard): Support cropping Bayer by odd numbers - // by adjusting fourcc. 
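// A hedged usage sketch for the ARGBToJ422() function added above: J422 is
// full-range (JPEG colorspace) YUV with 4:2:2 subsampling, so the U and V
// planes are half width but full height.  The helper name, malloc-based plane
// allocation and tightly packed strides are illustrative assumptions.
#include <stdlib.h>
#include "libyuv/convert_from_argb.h"

static int ConvertArgbToJ422(const uint8* argb, int width, int height) {
  int halfwidth = (width + 1) / 2;
  uint8* y_plane = (uint8*)malloc((size_t)width * height);
  uint8* u_plane = (uint8*)malloc((size_t)halfwidth * height);
  uint8* v_plane = (uint8*)malloc((size_t)halfwidth * height);
  int r = -1;
  if (y_plane && u_plane && v_plane) {
    r = ARGBToJ422(argb, width * 4,     // ARGB rows, 4 bytes per pixel
                   y_plane, width,      // full-resolution luma
                   u_plane, halfwidth,  // half-width chroma
                   v_plane, halfwidth,
                   width, height);      // returns 0 on success, -1 on bad args
  }
  free(y_plane);
  free(u_plane);
  free(v_plane);
  return r;
}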
- case FOURCC_BGGR: - src = sample + (src_width * crop_y + crop_x); - r = BayerBGGRToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - - case FOURCC_GBRG: - src = sample + (src_width * crop_y + crop_x); - r = BayerGBRGToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - - case FOURCC_GRBG: - src = sample + (src_width * crop_y + crop_x); - r = BayerGRBGToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - - case FOURCC_RGGB: - src = sample + (src_width * crop_y + crop_x); - r = BayerRGGBToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - case FOURCC_I400: src = sample + src_width * crop_y + crop_x; r = I400ToARGB(src, src_width, @@ -205,15 +174,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, crop_argb, argb_stride, crop_width, inv_crop_height); break; -// case FOURCC_Q420: -// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; -// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + -// src_width + crop_x * 2; -// r = Q420ToARGB(src, src_width * 3, -// src_uv, src_width * 3, -// crop_argb, argb_stride, -// crop_width, inv_crop_height); -// break; // Triplanar formats case FOURCC_I420: case FOURCC_YU12: @@ -241,6 +201,25 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, crop_width, inv_crop_height); break; } + + case FOURCC_J420: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I422: case FOURCC_YV16: { const uint8* src_y = sample + src_width * crop_y + crop_x; diff --git a/third_party/libyuv/source/convert_to_i420.cc b/third_party/libyuv/source/convert_to_i420.cc index 7b194fff7..5e75369b5 100644 --- a/third_party/libyuv/source/convert_to_i420.cc +++ b/third_party/libyuv/source/convert_to_i420.cc @@ -12,7 +12,6 @@ #include "libyuv/convert.h" -#include "libyuv/format_conversion.h" #include "libyuv/video_common.h" #ifdef __cplusplus @@ -173,40 +172,6 @@ int ConvertToI420(const uint8* sample, v, v_stride, crop_width, inv_crop_height); break; - // TODO(fbarchard): Support cropping Bayer by odd numbers - // by adjusting fourcc. 
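// The FOURCC_J420 case added to ConvertToARGB above locates the cropped
// planes inside one contiguous J420/I420 buffer: the full-size Y plane comes
// first, followed by the half-width, half-height U and V planes.  A minimal
// sketch of the same arithmetic for an even crop origin (CropI420Planes() and
// the struct are illustrative, not libyuv APIs); the NV12/NV21 hunks further
// below apply the analogous halved-crop math to a single interleaved UV plane.
#include <stdint.h>

typedef struct {
  const uint8_t* y;
  const uint8_t* u;
  const uint8_t* v;
} I420Planes;

static I420Planes CropI420Planes(const uint8_t* sample, int width, int height,
                                 int crop_x, int crop_y) {
  I420Planes p;
  int halfwidth = (width + 1) / 2;
  int halfheight = (height + 1) / 2;
  p.y = sample + width * crop_y + crop_x;
  // U starts right after Y; step down crop_y / 2 chroma rows and right
  // crop_x / 2 chroma columns.
  p.u = sample + width * height + halfwidth * (crop_y / 2) + crop_x / 2;
  // V starts halfwidth * halfheight bytes after the start of U.
  p.v = sample + width * height +
        halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
  return p;
}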
- case FOURCC_BGGR: - src = sample + (src_width * crop_y + crop_x); - r = BayerBGGRToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - case FOURCC_GBRG: - src = sample + (src_width * crop_y + crop_x); - r = BayerGBRGToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - case FOURCC_GRBG: - src = sample + (src_width * crop_y + crop_x); - r = BayerGRBGToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - case FOURCC_RGGB: - src = sample + (src_width * crop_y + crop_x); - r = BayerRGGBToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; r = I400ToI420(src, src_width, @@ -218,7 +183,8 @@ int ConvertToI420(const uint8* sample, // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y, y_stride, @@ -228,7 +194,8 @@ int ConvertToI420(const uint8* sample, break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); // Call NV12 but with u and v parameters swapped. r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, @@ -245,17 +212,6 @@ int ConvertToI420(const uint8* sample, v, v_stride, crop_width, inv_crop_height); break; - case FOURCC_Q420: - src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x; - src_uv = sample + (src_width + aligned_src_width * 2) * crop_y + - src_width + crop_x * 2; - r = Q420ToI420(src, src_width * 3, - src_uv, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; // Triplanar formats case FOURCC_I420: case FOURCC_YU12: diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc index 8f8a403ee..1efa26525 100644 --- a/third_party/libyuv/source/cpu_id.cc +++ b/third_party/libyuv/source/cpu_id.cc @@ -52,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { #if defined(_MSC_VER) && !defined(__clang__) #if (_MSC_FULL_VER >= 160040219) __cpuidex((int*)(cpu_info), info_eax, info_ecx); -#elif defined(_M_IX86) +#endif +#if defined(_M_IX86) __asm { mov eax, info_eax mov ecx, info_ecx @@ -98,13 +99,15 @@ int TestOsSaveYmm() { uint32 xcr0 = 0u; #if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. -#elif defined(_M_IX86) && defined(_MSC_VER) +#endif +#if defined(_M_IX86) && defined(_MSC_VER) __asm { xor ecx, ecx // xcr 0 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. mov xcr0, eax } -#elif defined(__i386__) || defined(__x86_64__) +#endif +#if defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); #endif // defined(_MSC_VER) return((xcr0 & 6) == 6); // Is ymm saved? @@ -135,6 +138,12 @@ int ArmCpuCaps(const char* cpuinfo_name) { fclose(f); return kCpuHasNEON; } + // aarch64 uses asimd for Neon. 
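// On arm64 the kernel lists Advanced SIMD as "asimd" rather than "neon" in
// the /proc/cpuinfo Features line, so the added check below accepts that
// token as well.  A self-contained sketch of the same test on one line of
// text; HasNeonFeature() and the sample line are illustrative.
#include <string.h>

static int HasNeonFeature(const char* cpuinfo_line) {
  const char* p = strstr(cpuinfo_line, " neon");
  if (p && (p[5] == ' ' || p[5] == '\n')) {
    return 1;
  }
  p = strstr(cpuinfo_line, " asimd");
  if (p && (p[6] == ' ' || p[6] == '\n')) {
    return 1;
  }
  return 0;
}

// Example: HasNeonFeature("Features : fp asimd evtstrm aes crc32\n") != 0.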
+ p = strstr(cpuinfo_line, " asimd"); + if (p && (p[6] == ' ' || p[6] == '\n')) { + fclose(f); + return kCpuHasNEON; + } } } fclose(f); @@ -240,7 +249,8 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_FMA3")) { cpu_info_ &= ~kCpuHasFMA3; } -#elif defined(__mips__) && defined(__linux__) +#endif +#if defined(__mips__) && defined(__linux__) // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. #if defined(__mips_dspr2) @@ -257,7 +267,8 @@ int InitCpuFlags(void) { if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { cpu_info_ &= ~kCpuHasMIPS_DSPR2; } -#elif defined(__arm__) || defined(__aarch64__) +#endif +#if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon. @@ -266,7 +277,8 @@ int InitCpuFlags(void) { // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon // flag in it. // So for aarch64, neon enabling is hard coded here. -#elif defined(__aarch64__) +#endif +#if defined(__aarch64__) cpu_info_ = kCpuHasNEON; #else // Linux arm parse text file for neon detect. diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc deleted file mode 100644 index 3c1737153..000000000 --- a/third_party/libyuv/source/format_conversion.cc +++ /dev/null @@ -1,554 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/format_conversion.h" - -#include "libyuv/basic_types.h" -#include "libyuv/cpu_id.h" -#include "libyuv/video_common.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// generate a selector mask useful for pshufb -static uint32 GenerateSelector(int select0, int select1) { - return (uint32)(select0) | - (uint32)((select1 + 4) << 8) | - (uint32)((select0 + 8) << 16) | - (uint32)((select1 + 12) << 24); -} - -static int MakeSelectors(const int blue_index, - const int green_index, - const int red_index, - uint32 dst_fourcc_bayer, - uint32* index_map) { - // Now build a lookup table containing the indices for the four pixels in each - // 2x2 Bayer grid. - switch (dst_fourcc_bayer) { - case FOURCC_BGGR: - index_map[0] = GenerateSelector(blue_index, green_index); - index_map[1] = GenerateSelector(green_index, red_index); - break; - case FOURCC_GBRG: - index_map[0] = GenerateSelector(green_index, blue_index); - index_map[1] = GenerateSelector(red_index, green_index); - break; - case FOURCC_RGGB: - index_map[0] = GenerateSelector(red_index, green_index); - index_map[1] = GenerateSelector(green_index, blue_index); - break; - case FOURCC_GRBG: - index_map[0] = GenerateSelector(green_index, red_index); - index_map[1] = GenerateSelector(blue_index, green_index); - break; - default: - return -1; // Bad FourCC - } - return 0; -} - -// Converts 32 bit ARGB to Bayer RGB formats. 
-LIBYUV_API -int ARGBToBayer(const uint8* src_argb, int src_stride_argb, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height, - uint32 dst_fourcc_bayer) { - int y; - const int blue_index = 0; // Offsets for ARGB format - const int green_index = 1; - const int red_index = 2; - uint32 index_map[2]; - void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; - } - } -#elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; - } - } -#endif - if (MakeSelectors(blue_index, green_index, red_index, - dst_fourcc_bayer, index_map)) { - return -1; // Bad FourCC - } - - for (y = 0; y < height; ++y) { - ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width); - src_argb += src_stride_argb; - dst_bayer += dst_stride_bayer; - } - return 0; -} - -#define AVG(a, b) (((a) + (b)) >> 1) - -static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_argb, int pix) { - const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; - uint8 g = src_bayer0[1]; - uint8 r = src_bayer1[1]; - int x; - for (x = 0; x < pix - 2; x += 2) { - dst_argb[0] = src_bayer0[0]; - dst_argb[1] = AVG(g, src_bayer0[1]); - dst_argb[2] = AVG(r, src_bayer1[1]); - dst_argb[3] = 255U; - dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]); - dst_argb[5] = src_bayer0[1]; - dst_argb[6] = src_bayer1[1]; - dst_argb[7] = 255U; - g = src_bayer0[1]; - r = src_bayer1[1]; - src_bayer0 += 2; - src_bayer1 += 2; - dst_argb += 8; - } - dst_argb[0] = src_bayer0[0]; - dst_argb[1] = AVG(g, src_bayer0[1]); - dst_argb[2] = AVG(r, src_bayer1[1]); - dst_argb[3] = 255U; - if (!(pix & 1)) { - dst_argb[4] = src_bayer0[0]; - dst_argb[5] = src_bayer0[1]; - dst_argb[6] = src_bayer1[1]; - dst_argb[7] = 255U; - } -} - -static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_argb, int pix) { - const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; - uint8 g = src_bayer0[1]; - uint8 b = src_bayer1[1]; - int x; - for (x = 0; x < pix - 2; x += 2) { - dst_argb[0] = AVG(b, src_bayer1[1]); - dst_argb[1] = AVG(g, src_bayer0[1]); - dst_argb[2] = src_bayer0[0]; - dst_argb[3] = 255U; - dst_argb[4] = src_bayer1[1]; - dst_argb[5] = src_bayer0[1]; - dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]); - dst_argb[7] = 255U; - g = src_bayer0[1]; - b = src_bayer1[1]; - src_bayer0 += 2; - src_bayer1 += 2; - dst_argb += 8; - } - dst_argb[0] = AVG(b, src_bayer1[1]); - dst_argb[1] = AVG(g, src_bayer0[1]); - dst_argb[2] = src_bayer0[0]; - dst_argb[3] = 255U; - if (!(pix & 1)) { - dst_argb[4] = src_bayer1[1]; - dst_argb[5] = src_bayer0[1]; - dst_argb[6] = src_bayer0[0]; - dst_argb[7] = 255U; - } -} - -static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_argb, int pix) { - const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; - uint8 b = src_bayer0[1]; - int x; - for (x = 0; x < pix - 2; x += 2) { - dst_argb[0] = AVG(b, src_bayer0[1]); - dst_argb[1] = src_bayer0[0]; - dst_argb[2] = src_bayer1[0]; - 
dst_argb[3] = 255U; - dst_argb[4] = src_bayer0[1]; - dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); - dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]); - dst_argb[7] = 255U; - b = src_bayer0[1]; - src_bayer0 += 2; - src_bayer1 += 2; - dst_argb += 8; - } - dst_argb[0] = AVG(b, src_bayer0[1]); - dst_argb[1] = src_bayer0[0]; - dst_argb[2] = src_bayer1[0]; - dst_argb[3] = 255U; - if (!(pix & 1)) { - dst_argb[4] = src_bayer0[1]; - dst_argb[5] = src_bayer0[0]; - dst_argb[6] = src_bayer1[0]; - dst_argb[7] = 255U; - } -} - -static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, - uint8* dst_argb, int pix) { - const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; - uint8 r = src_bayer0[1]; - int x; - for (x = 0; x < pix - 2; x += 2) { - dst_argb[0] = src_bayer1[0]; - dst_argb[1] = src_bayer0[0]; - dst_argb[2] = AVG(r, src_bayer0[1]); - dst_argb[3] = 255U; - dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]); - dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]); - dst_argb[6] = src_bayer0[1]; - dst_argb[7] = 255U; - r = src_bayer0[1]; - src_bayer0 += 2; - src_bayer1 += 2; - dst_argb += 8; - } - dst_argb[0] = src_bayer1[0]; - dst_argb[1] = src_bayer0[0]; - dst_argb[2] = AVG(r, src_bayer0[1]); - dst_argb[3] = 255U; - if (!(pix & 1)) { - dst_argb[4] = src_bayer1[0]; - dst_argb[5] = src_bayer0[0]; - dst_argb[6] = src_bayer0[1]; - dst_argb[7] = 255U; - } -} - -// Converts any Bayer RGB format to ARGB. -LIBYUV_API -int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int dst_stride_argb, - int width, int height, - uint32 src_fourcc_bayer) { - int y; - void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int pix); - void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int pix); - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - switch (src_fourcc_bayer) { - case FOURCC_BGGR: - BayerRow0 = BayerRowBG; - BayerRow1 = BayerRowGR; - break; - case FOURCC_GBRG: - BayerRow0 = BayerRowGB; - BayerRow1 = BayerRowRG; - break; - case FOURCC_GRBG: - BayerRow0 = BayerRowGR; - BayerRow1 = BayerRowBG; - break; - case FOURCC_RGGB: - BayerRow0 = BayerRowRG; - BayerRow1 = BayerRowGB; - break; - default: - return -1; // Bad FourCC - } - - for (y = 0; y < height - 1; y += 2) { - BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); - BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, - dst_argb + dst_stride_argb, width); - src_bayer += src_stride_bayer * 2; - dst_argb += dst_stride_argb * 2; - } - if (height & 1) { - BayerRow0(src_bayer, src_stride_bayer, dst_argb, width); - } - return 0; -} - -// Converts any Bayer RGB format to ARGB. -LIBYUV_API -int BayerToI420(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, - uint32 src_fourcc_bayer) { - void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int pix); - void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer, - uint8* dst_argb, int pix); - - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = - ARGBToYRow_C; - // Negative height means invert the image. 
- if (height < 0) { - int halfheight; - height = -height; - halfheight = (height + 1) >> 1; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (halfheight - 1) * dst_stride_u; - dst_v = dst_v + (halfheight - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } -#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - ARGBToUVRow = ARGBToUVRow_SSSE3; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } - } -#elif defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } -#endif - - switch (src_fourcc_bayer) { - case FOURCC_BGGR: - BayerRow0 = BayerRowBG; - BayerRow1 = BayerRowGR; - break; - case FOURCC_GBRG: - BayerRow0 = BayerRowGB; - BayerRow1 = BayerRowRG; - break; - case FOURCC_GRBG: - BayerRow0 = BayerRowGR; - BayerRow1 = BayerRowBG; - break; - case FOURCC_RGGB: - BayerRow0 = BayerRowRG; - BayerRow1 = BayerRowGB; - break; - default: - return -1; // Bad FourCC - } - - { - // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; - align_buffer_64(row, kRowSize * 2); - int y; - for (y = 0; y < height - 1; y += 2) { - BayerRow0(src_bayer, src_stride_bayer, row, width); - BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, - row + kRowSize, width); - ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); - src_bayer += src_stride_bayer * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - BayerRow0(src_bayer, src_stride_bayer, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); - ARGBToYRow(row, dst_y, width); - } - free_aligned_buffer_64(row); - } - return 0; -} - -// Convert I420 to Bayer. -LIBYUV_API -int I420ToBayer(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bayer, int dst_stride_bayer, - int width, int height, - uint32 dst_fourcc_bayer) { - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; - void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; - const int blue_index = 0; // Offsets for ARGB format - const int green_index = 1; - const int red_index = 2; - uint32 index_map[2]; - // Negative height means invert the image. 
- if (height < 0) { - int halfheight; - height = -height; - halfheight = (height + 1) >> 1; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; - } -#endif - -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; - } - } -#elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; - } - } -#endif - - if (MakeSelectors(blue_index, green_index, red_index, - dst_fourcc_bayer, index_map)) { - return -1; // Bad FourCC - } - { - // Allocate a row of ARGB. 
- align_buffer_64(row, width * 4); - int y; - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row, width); - ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width); - dst_bayer += dst_stride_bayer; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - free_aligned_buffer_64(row); - } - return 0; -} - -#define MAKEBAYERFOURCC(BAYER) \ -LIBYUV_API \ -int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \ - uint8* dst_y, int dst_stride_y, \ - uint8* dst_u, int dst_stride_u, \ - uint8* dst_v, int dst_stride_v, \ - int width, int height) { \ - return BayerToI420(src_bayer, src_stride_bayer, \ - dst_y, dst_stride_y, \ - dst_u, dst_stride_u, \ - dst_v, dst_stride_v, \ - width, height, \ - FOURCC_##BAYER); \ -} \ - \ -LIBYUV_API \ -int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \ - const uint8* src_u, int src_stride_u, \ - const uint8* src_v, int src_stride_v, \ - uint8* dst_bayer, int dst_stride_bayer, \ - int width, int height) { \ - return I420ToBayer(src_y, src_stride_y, \ - src_u, src_stride_u, \ - src_v, src_stride_v, \ - dst_bayer, dst_stride_bayer, \ - width, height, \ - FOURCC_##BAYER); \ -} \ - \ -LIBYUV_API \ -int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \ - uint8* dst_bayer, int dst_stride_bayer, \ - int width, int height) { \ - return ARGBToBayer(src_argb, src_stride_argb, \ - dst_bayer, dst_stride_bayer, \ - width, height, \ - FOURCC_##BAYER); \ -} \ - \ -LIBYUV_API \ -int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \ - uint8* dst_argb, int dst_stride_argb, \ - int width, int height) { \ - return BayerToARGB(src_bayer, src_stride_bayer, \ - dst_argb, dst_stride_argb, \ - width, height, \ - FOURCC_##BAYER); \ -} - -MAKEBAYERFOURCC(BGGR) -MAKEBAYERFOURCC(GBRG) -MAKEBAYERFOURCC(GRBG) -MAKEBAYERFOURCC(RGGB) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc index 23d22d099..40ce2f787 100644 --- a/third_party/libyuv/source/mjpeg_validate.cc +++ b/third_party/libyuv/source/mjpeg_validate.cc @@ -10,15 +10,66 @@ #include "libyuv/mjpeg_decoder.h" +#include <string.h> // For memchr. + #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// Enable this to try scasb implementation. +// #define ENABLE_SCASB 1 + +#ifdef ENABLE_SCASB + +// Multiple of 1. +__declspec(naked) __declspec(align(16)) +const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // src + mov eax, [esp + 8] // val + mov ecx, [esp + 12] // count + repne scasb + jne sr99 + mov eax, edi + sub eax, 1 + mov edi, edx + ret + + sr99: + mov eax, 0 + mov edi, edx + ret + } +} +#endif + +// Helper function to scan for EOI marker. +static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { + const uint8* end = sample + sample_size - 1; + const uint8* it = sample; + for (;;) { +#ifdef ENABLE_SCASB + it = ScanRow_ERMS(it, 0xff, end - it); +#else + it = static_cast<const uint8*>(memchr(it, 0xff, end - it)); +#endif + if (it == NULL) { + break; + } + if (it[1] == 0xd9) { + return LIBYUV_TRUE; // Success: Valid jpeg. + } + ++it; // Skip over current 0xff. + } + // ERROR: Invalid jpeg end code not found. Size sample_size + return LIBYUV_FALSE; +} + // Helper function to validate the jpeg appears intact. -// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
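// ValidateJpeg(), rewritten just below, still requires the SOI marker at the
// start of the buffer and then looks for the EOI marker (0xff 0xd9) with
// ScanEOI(), searching the final kilobyte first before falling back to a
// forward scan of the rest.  A hedged usage sketch follows; ValidateJpeg is
// assumed to be declared in libyuv/mjpeg_decoder.h, and the helper name is
// illustrative.
#include <stddef.h>
#include "libyuv/mjpeg_decoder.h"

static int LooksLikeCompleteJpeg(const uint8* frame, size_t frame_size) {
  // Cheap way to reject truncated MJPEG samples before handing them to the
  // decoder: nonzero only when both SOI and EOI markers are present.
  return ValidateJpeg(frame, frame_size);
}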
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { - size_t i; + const size_t kBackSearchSize = 1024; if (sample_size < 64) { // ERROR: Invalid jpeg size: sample_size return LIBYUV_FALSE; @@ -27,17 +78,20 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { // ERROR: Invalid jpeg initial start code return LIBYUV_FALSE; } - for (i = sample_size - 2; i > 1;) { - if (sample[i] != 0xd9) { - if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image - return LIBYUV_TRUE; // Success: Valid jpeg. - } - --i; + // Step over SOI marker. + sample += 2; + sample_size -= 2; + + // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer. + if (sample_size > kBackSearchSize) { + if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) { + return LIBYUV_TRUE; // Success: Valid jpeg. } - --i; + // Reduce search size for forward search. + sample_size = sample_size - kBackSearchSize + 1; } - // ERROR: Invalid jpeg end code not found. Size sample_size - return LIBYUV_FALSE; + return ScanEOI(sample, sample_size); + } #ifdef __cplusplus diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc index 3857008ca..75ef775dd 100644 --- a/third_party/libyuv/source/planar_functions.cc +++ b/third_party/libyuv/source/planar_functions.cc @@ -41,16 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - CopyRow = CopyRow_SSE2; +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -59,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MIPS) @@ -90,15 +88,8 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_COPYROW_16_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_16_X86; - } -#endif #if defined(HAS_COPYROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_SSE2; } #endif @@ -239,25 +230,43 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } #if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } } #endif #if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } } #endif #if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - MirrorRow = MirrorRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } } #endif #if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. 
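The CopyPlane and MirrorPlane hunks above show the dispatch pattern this patch applies throughout: per-pointer alignment checks are dropped (the kernels now use unaligned loads/stores), an _Any_ variant is selected whenever the CPU feature is present, and the full-width kernel is used only when the width is a multiple of the vector block. A self-contained sketch of that selection logic, with stand-in kernels and a stand-in feature probe rather than the real libyuv symbols:

#include <stdint.h>
#include <string.h>

typedef void (*CopyRowFunc)(const uint8_t* src, uint8_t* dst, int width);

// Stand-ins so the sketch compiles; real code points these at the C,
// *_Any_SSE2 and *_SSE2 row kernels and at TestCpuFlag(kCpuHasSSE2).
static void CopyRow_Ref(const uint8_t* s, uint8_t* d, int w) { memcpy(d, s, w); }
static void CopyRowAny_Ref(const uint8_t* s, uint8_t* d, int w) { memcpy(d, s, w); }
static void CopyRowFull_Ref(const uint8_t* s, uint8_t* d, int w) { memcpy(d, s, w); }
static bool CpuHasSse2_Ref() { return false; }  // pretend feature probe

static CopyRowFunc PickCopyRow(int width) {
  CopyRowFunc row = CopyRow_Ref;  // portable fallback
  if (CpuHasSse2_Ref()) {
    // The 'Any' variant tolerates ragged widths; the full kernel is only
    // chosen when the width is a whole number of 32-byte blocks.
    row = (width % 32 == 0) ? CopyRowFull_Ref : CopyRowAny_Ref;
  }
  return row;
}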
+#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; } #endif @@ -298,23 +307,17 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif #if defined(HAS_YUY2TOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -324,7 +327,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, } #endif #if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; if (width >= 16) { YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; @@ -376,23 +379,17 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToYRow = UYVYToYRow_SSE2; - } - } + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; } } #endif #if defined(HAS_UYVYTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -402,7 +399,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, } #endif #if defined(HAS_UYVYTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; if (width >= 16) { UYVYToUV422Row = UYVYToUV422Row_Any_NEON; @@ -497,22 +494,28 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - -#if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBMirrorRow = ARGBMirrorRow_SSSE3; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if 
(IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } } #endif #if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } -#endif -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } } #endif @@ -614,7 +617,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_ARGBMULTIPLYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBMultiplyRow = ARGBMultiplyRow_SSE2; @@ -622,7 +625,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBMULTIPLYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBMultiplyRow = ARGBMultiplyRow_AVX2; @@ -630,7 +633,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBMULTIPLYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBMultiplyRow = ARGBMultiplyRow_NEON; @@ -680,7 +683,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBAddRow = ARGBAddRow_SSE2; @@ -688,7 +691,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBADDROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBAddRow = ARGBAddRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAddRow = ARGBAddRow_AVX2; @@ -696,7 +699,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBADDROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBAddRow = ARGBAddRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAddRow = ARGBAddRow_NEON; @@ -741,7 +744,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_ARGBSUBTRACTROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBSubtractRow = ARGBSubtractRow_SSE2; @@ -749,7 +752,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBSUBTRACTROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBSubtractRow = ARGBSubtractRow_AVX2; @@ -757,7 +760,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_ARGBSUBTRACTROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBSubtractRow = ARGBSubtractRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBSubtractRow = ARGBSubtractRow_NEON; @@ -808,24 +811,31 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, height = 1; 
src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif #if defined(HAS_I422TOBGRAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToBGRARow = I422ToBGRARow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { I422ToBGRARow = I422ToBGRARow_NEON; } } -#elif defined(HAS_I422TOBGRAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToBGRARow = I422ToBGRARow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - I422ToBGRARow = I422ToBGRARow_SSSE3; - } - } - } -#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && @@ -879,20 +889,26 @@ int I422ToABGR(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; } #if defined(HAS_I422TOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { I422ToABGRRow = I422ToABGRRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { I422ToABGRRow = I422ToABGRRow_NEON; } } -#elif defined(HAS_I422TOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { +#endif +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; - } + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; } } #endif @@ -941,20 +957,26 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; } #if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_NEON; } } -#elif defined(HAS_I422TORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { +#endif +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { - I422ToRGBARow = I422ToRGBARow_SSSE3; - } + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; } } #endif @@ -991,14 +1013,23 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_NV12TORGB565ROW_SSSE3) - if 
(TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV12TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToRGB565Row = NV12ToRGB565Row_NEON; @@ -1039,14 +1070,23 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_NV21TORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV21TORGB565ROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { +#endif +#if defined(HAS_NV21TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB565Row = NV21ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV21TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { NV21ToRGB565Row = NV21ToRGB565Row_NEON; @@ -1070,8 +1110,12 @@ void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value) { int y; - uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); - void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; + void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C; + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } // Coalesce rows. 
if (dst_stride_y == width) { width *= height; @@ -1079,21 +1123,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y, dst_stride_y = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - SetRow = SetRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } } #endif #if defined(HAS_SETROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - SetRow = SetRow_X86; + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; } #endif // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, v32, width); + SetRow(dst_y, value, width); dst_y += dst_stride_y; } } @@ -1112,7 +1165,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); if (!dst_y || !dst_u || !dst_v || - width <= 0 || height <= 0 || + width <= 0 || height == 0 || x < 0 || y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || @@ -1132,11 +1185,18 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height, uint32 value) { + int y; + void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C; if (!dst_argb || - width <= 0 || height <= 0 || + width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } dst_argb += dst_y * dst_stride_argb + dst_x * 4; // Coalesce rows. 
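Two behavioural points in the SetPlane hunk above are easy to miss: SetRow now takes the fill value as a single byte (the old code pre-replicated it into a uint32), and a negative height is handled with the usual libyuv convention of writing the rows bottom-up (vertical flip). A scalar sketch of the resulting flow, with hypothetical names, not the libyuv implementation:

#include <stdint.h>
#include <string.h>

// Illustrative equivalent of the new SetPlane structure.
static void FillPlane(uint8_t* dst, int dst_stride,
                      int width, int height, uint8_t value) {
  if (height < 0) {               // Negative height: start at the last row
    height = -height;             // and walk upward (vertical flip).
    dst = dst + (height - 1) * dst_stride;
    dst_stride = -dst_stride;
  }
  if (dst_stride == width) {      // Contiguous rows: coalesce into one row.
    width *= height;
    height = 1;
    dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memset(dst, value, width);    // One byte value per pixel of the plane.
    dst += dst_stride;
  }
}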
if (dst_stride_argb == width * 4) { @@ -1144,20 +1204,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); - return 0; + +#if defined(HAS_ARGBSETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSetRow = ARGBSetRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_NEON; + } } #endif -#if defined(HAS_SETROW_X86) +#if defined(HAS_ARGBSETROW_X86) if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); - return 0; + ARGBSetRow = ARGBSetRow_X86; } #endif - ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); + + // Set plane + for (y = 0; y < height; ++y) { + ARGBSetRow(dst_argb, value, width); + dst_argb += dst_stride_argb; + } return 0; } @@ -1197,9 +1263,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSE2; @@ -1207,7 +1271,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; @@ -1215,7 +1279,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; @@ -1223,7 +1287,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; @@ -1263,7 +1327,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBUNATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; @@ -1271,7 +1335,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBUNATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; @@ -1312,12 +1376,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if 
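Since ARGBRect now loops over the rows itself and calls a per-row ARGBSetRow (the ARGBSetRows_* helpers are removed), it helps to keep in mind what one row of that fill does: each pixel is a packed 32-bit value, so a plain memset does not apply. A scalar reference with hypothetical names, not the libyuv kernels:

#include <stdint.h>
#include <string.h>

// Scalar reference for filling one ARGB row with a packed 32-bit value.
static void ArgbSetRow_Ref(uint8_t* dst_argb, uint32_t value, int width) {
  for (int x = 0; x < width; ++x) {
    memcpy(dst_argb + 4 * x, &value, 4);  // one ARGB pixel per 32-bit store
  }
}

// Caller structure matching the reworked ARGBRect: pick a row kernel once,
// then walk the rows.
static void FillArgbRect_Ref(uint8_t* dst_argb, int dst_stride_argb,
                             int width, int height, uint32_t value) {
  for (int y = 0; y < height; ++y) {
    ArgbSetRow_Ref(dst_argb, value, width);
    dst_argb += dst_stride_argb;
  }
}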
(TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } -#elif defined(HAS_ARGBGRAYROW_NEON) +#endif +#if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } @@ -1350,11 +1413,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } -#elif defined(HAS_ARGBGRAYROW_NEON) +#endif +#if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } @@ -1383,11 +1446,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBSEPIAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_SSSE3; } -#elif defined(HAS_ARGBSEPIAROW_NEON) +#endif +#if defined(HAS_ARGBSEPIAROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_NEON; } @@ -1425,11 +1488,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } -#elif defined(HAS_ARGBCOLORMATRIXROW_NEON) +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } @@ -1568,11 +1631,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, dst_stride_argb = 0; } #if defined(HAS_ARGBQUANTIZEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } -#elif defined(HAS_ARGBQUANTIZEROW_NEON) +#endif +#if defined(HAS_ARGBQUANTIZEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } @@ -1743,12 +1806,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBSHADEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_SSE2; } -#elif defined(HAS_ARGBSHADEROW_NEON) +#endif +#if defined(HAS_ARGBSHADEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_NEON; } @@ -1790,33 +1852,23 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && - 
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && - IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { InterpolateRow = InterpolateRow_AVX2; @@ -1824,19 +1876,19 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 4)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 && +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2; + InterpolateRow = InterpolateRow_MIPS_DSPR2; } #endif @@ -1876,7 +1928,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_stride_bgra = dst_stride_argb = 0; } #if defined(HAS_ARGBSHUFFLEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBShuffleRow = ARGBShuffleRow_SSE2; @@ -1884,19 +1936,15 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBShuffleRow = ARGBShuffleRow_SSSE3; - } + ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif #if defined(HAS_ARGBSHUFFLEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + if (TestCpuFlag(kCpuHasAVX2)) { ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGBShuffleRow = ARGBShuffleRow_AVX2; @@ -1904,7 +1952,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_ARGBSHUFFLEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBShuffleRow = ARGBShuffleRow_Any_NEON; if (IS_ALIGNED(width, 4)) { ARGBShuffleRow = ARGBShuffleRow_NEON; @@ -1947,8 +1995,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. 
#if defined(HAS_ARGBTOBAYERGGROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2)) { ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerGGRow_SSE2; @@ -1956,8 +2003,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; @@ -1965,7 +2011,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBTOBAYERGGROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerGGRow_NEON; @@ -2048,8 +2094,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelRow = SobelRow_SSE2; } #endif @@ -2070,8 +2115,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_SSE2; } #endif @@ -2093,8 +2137,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_SSE2; } #endif @@ -2218,10 +2261,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif @@ -2264,10 +2304,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && - IS_ALIGNED(width, 8)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc index 2ef3228cb..5acaccfd8 100644 --- a/third_party/libyuv/source/rotate.cc +++ b/third_party/libyuv/source/rotate.cc @@ -42,11 +42,7 @@ extern "C" { #endif 
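For the Sobel wrappers above: ARGBSobelize extracts a single channel (the ARGBToBayerGG rows select G) and builds per-pixel x/y gradient planes, and SobelRow then merges those two planes into gray ARGB output, roughly as a clamped sum of the two gradient magnitudes. An approximate scalar sketch of that merge step, illustrative only and not the libyuv SobelRow_C:

#include <stdint.h>

// Approximate scalar merge of Sobel X and Sobel Y planes into gray ARGB.
static void SobelMergeRow_Ref(const uint8_t* sobelx, const uint8_t* sobely,
                              uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int s = sobelx[x] + sobely[x];
    if (s > 255) s = 255;                      // clamp the combined magnitude
    dst_argb[4 * x + 0] = (uint8_t)s;          // B
    dst_argb[4 * x + 1] = (uint8_t)s;          // G
    dst_argb[4 * x + 2] = (uint8_t)s;          // R
    dst_argb[4 * x + 3] = 255;                 // A, opaque
  }
}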
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_MIRRORROW_NEON -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -#define HAS_MIRRORROW_UV_NEON -void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSE_WX8_NEON void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width); @@ -55,7 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width); -#endif // defined(__ARM_NEON__) +#endif #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ defined(__mips__) && \ @@ -194,31 +190,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, convertloop: // Read in the data from the source pointer. // First round of bit swap. - movdqa xmm0, [eax] - movdqa xmm1, [eax + edi] + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm0 // use xmm7 as temp register. punpcklbw xmm0, xmm1 punpckhbw xmm7, xmm1 movdqa xmm1, xmm7 - movdqa xmm2, [eax] - movdqa xmm3, [eax + edi] + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm2 punpcklbw xmm2, xmm3 punpckhbw xmm7, xmm3 movdqa xmm3, xmm7 - movdqa xmm4, [eax] - movdqa xmm5, [eax + edi] + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm4 punpcklbw xmm4, xmm5 punpckhbw xmm7, xmm5 movdqa xmm5, xmm7 - movdqa xmm6, [eax] - movdqa xmm7, [eax + edi] + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] lea eax, [eax + 2 * edi] - movdqa [esp], xmm5 // backup xmm5 + movdqu [esp], xmm5 // backup xmm5 neg edi movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 @@ -239,8 +235,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, punpcklwd xmm4, xmm6 punpckhwd xmm5, xmm6 movdqa xmm6, xmm5 - movdqa xmm5, [esp] // restore xmm5 - movdqa [esp], xmm6 // backup xmm6 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 movdqa xmm6, xmm5 // use xmm6 as temp register. 
punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 @@ -251,7 +247,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 movdqa xmm4, xmm6 - movdqa xmm6, [esp] // restore xmm6 + movdqu xmm6, [esp] // restore xmm6 movlpd qword ptr [edx], xmm0 movhpd qword ptr [ebx], xmm0 movlpd qword ptr [edx + esi], xmm4 @@ -296,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif !defined(LIBYUV_DISABLE_X86) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, @@ -379,10 +376,8 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, "+r"(width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc" - #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - #endif + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -411,31 +406,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "mov 0x2c(%ecx),%ecx \n" "1: \n" - "movdqa (%eax),%xmm0 \n" - "movdqa (%eax,%edi,1),%xmm1 \n" + "movdqu (%eax),%xmm0 \n" + "movdqu (%eax,%edi,1),%xmm1 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm0,%xmm7 \n" "punpcklbw %xmm1,%xmm0 \n" "punpckhbw %xmm1,%xmm7 \n" "movdqa %xmm7,%xmm1 \n" - "movdqa (%eax),%xmm2 \n" - "movdqa (%eax,%edi,1),%xmm3 \n" + "movdqu (%eax),%xmm2 \n" + "movdqu (%eax,%edi,1),%xmm3 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm2,%xmm7 \n" "punpcklbw %xmm3,%xmm2 \n" "punpckhbw %xmm3,%xmm7 \n" "movdqa %xmm7,%xmm3 \n" - "movdqa (%eax),%xmm4 \n" - "movdqa (%eax,%edi,1),%xmm5 \n" + "movdqu (%eax),%xmm4 \n" + "movdqu (%eax,%edi,1),%xmm5 \n" "lea (%eax,%edi,2),%eax \n" "movdqa %xmm4,%xmm7 \n" "punpcklbw %xmm5,%xmm4 \n" "punpckhbw %xmm5,%xmm7 \n" "movdqa %xmm7,%xmm5 \n" - "movdqa (%eax),%xmm6 \n" - "movdqa (%eax,%edi,1),%xmm7 \n" + "movdqu (%eax),%xmm6 \n" + "movdqu (%eax,%edi,1),%xmm7 \n" "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm5,(%esp) \n" + "movdqu %xmm5,(%esp) \n" "neg %edi \n" "movdqa %xmm6,%xmm5 \n" "punpcklbw %xmm7,%xmm6 \n" @@ -455,8 +450,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "punpcklwd %xmm6,%xmm4 \n" "punpckhwd %xmm6,%xmm5 \n" "movdqa %xmm5,%xmm6 \n" - "movdqa (%esp),%xmm5 \n" - "movdqa %xmm6,(%esp) \n" + "movdqu (%esp),%xmm5 \n" + "movdqu %xmm6,(%esp) \n" "movdqa %xmm5,%xmm6 \n" "punpcklwd %xmm7,%xmm5 \n" "punpckhwd %xmm7,%xmm6 \n" @@ -465,7 +460,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "punpckldq %xmm4,%xmm0 \n" "punpckhdq %xmm4,%xmm6 \n" "movdqa %xmm6,%xmm4 \n" - "movdqa (%esp),%xmm6 \n" + "movdqu (%esp),%xmm6 \n" "movlpd %xmm0,(%edx) \n" "movhpd %xmm0,(%ebx) \n" "movlpd %xmm4,(%edx,%esi,1) \n" @@ -514,7 +509,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "ret \n" #endif ); -#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. #define HAS_TRANSPOSE_WX8_FAST_SSSE3 @@ -525,38 +521,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, // First round of bit swap. 
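A note on the movdqa to movdqu swaps running through these transpose kernels: they replace alignment-requiring 128-bit loads and stores with their unaligned forms, which is what allows the callers elsewhere in this patch to drop their IS_ALIGNED pointer checks; on recent x86 cores the unaligned forms cost essentially nothing when the data happens to be aligned anyway. In intrinsics terms the same swap looks like this (a sketch, not code from the patch):

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// movdqa corresponds to _mm_load_si128 / _mm_store_si128 (16-byte aligned);
// movdqu corresponds to _mm_loadu_si128 / _mm_storeu_si128 (any alignment).
static __m128i Load16(const uint8_t* p) {
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}

static void Store16(uint8_t* p, __m128i v) {
  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
}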
".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3),%%xmm1 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu (%0),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm8,%%xmm9 \n" "palignr $0x8,%%xmm1,%%xmm1 \n" "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqa (%0,%3),%%xmm3 \n" + "movdqu (%0,%3),%%xmm3 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm2,%%xmm10 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm10 \n" "movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm10,%%xmm11 \n" - "movdqa (%0),%%xmm4 \n" + "movdqu (%0),%%xmm4 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqa (%0,%3),%%xmm5 \n" + "movdqu (%0,%3),%%xmm5 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm4,%%xmm12 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm12 \n" "movdqa %%xmm4,%%xmm5 \n" "movdqa %%xmm12,%%xmm13 \n" - "movdqa (%0),%%xmm6 \n" + "movdqu (%0),%%xmm6 \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqa (%0,%3),%%xmm7 \n" + "movdqu (%0,%3),%%xmm7 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm6,%%xmm14 \n" "punpcklbw %%xmm7,%%xmm6 \n" @@ -666,29 +662,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, // First round of bit swap. ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%4),%%xmm1 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" "movdqa %%xmm8,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" - "movdqa (%0,%4),%%xmm3 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm2,%%xmm8 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm8 \n" "movdqa %%xmm8,%%xmm3 \n" - "movdqa (%0),%%xmm4 \n" - "movdqa (%0,%4),%%xmm5 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm4,%%xmm8 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm8 \n" "movdqa %%xmm8,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%4),%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm6,%%xmm8 \n" "punpcklbw %%xmm7,%%xmm6 \n" @@ -818,9 +814,7 @@ void TransposePlane(const uint8* src, int src_stride, } #endif #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { TransposeWx8 = TransposeWx8_FAST_SSSE3; } #endif @@ -883,29 +877,38 @@ void RotatePlane180(const uint8* src, int src_stride, void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } } #endif #if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - MirrorRow = MirrorRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } } #endif #if defined(HAS_MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && 
IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - MirrorRow = MirrorRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } } #endif #if defined(HAS_MIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { - MirrorRow = MirrorRow_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } } #endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. #if defined(HAS_MIRRORROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && @@ -913,21 +916,14 @@ void RotatePlane180(const uint8* src, int src_stride, MirrorRow = MirrorRow_MIPS_DSPR2; } #endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -935,6 +931,11 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = CopyRow_ERMS; } #endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; @@ -1010,13 +1011,13 @@ void TransposeUV(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; } -#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_TRANSPOSE_UVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } -#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) +#endif +#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; @@ -1084,12 +1085,13 @@ void RotateUV180(const uint8* src, int src_stride, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorRowUV = MirrorUVRow_NEON; } -#elif defined(HAS_MIRRORROW_UV_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorRowUV = MirrorUVRow_SSSE3; } -#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) +#endif +#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { MirrorRowUV = MirrorUVRow_MIPS_DSPR2; diff --git a/third_party/libyuv/source/rotate_argb.cc b/third_party/libyuv/source/rotate_argb.cc index ab0f9ce07..b9673db15 100644 --- a/third_party/libyuv/source/rotate_argb.cc +++ b/third_party/libyuv/source/rotate_argb.cc @@ -31,7 +31,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); #endif #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBROWDOWNEVEN_NEON void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, int src_stepx, @@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest. - IS_ALIGNED(src, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. 
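The RotatePlane180 and RotateUV180 dispatch changes in the hunks above (and ARGBRotate180 below) all serve the same simple operation: a 180-degree rotation writes each source row horizontally mirrored, with the row order reversed. A scalar reference for a single plane, illustrative only; the libyuv code path performs the mirroring and copying with the SIMD row kernels selected here:

#include <stdint.h>

// Destination row y is the horizontally mirrored source row (height - 1 - y).
static void RotatePlane180_Ref(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride,
                               int width, int height) {
  for (int y = 0; y < height; ++y) {
    const uint8_t* s = src + (height - 1 - y) * src_stride;
    uint8_t* d = dst + y * dst_stride;
    for (int x = 0; x < width; ++x) {
      d[x] = s[width - 1 - x];  // mirror within the row
    }
  }
}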
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; } #endif @@ -102,38 +101,38 @@ void ARGBRotate180(const uint8* src, int src_stride, void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = ARGBMirrorRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_ARGBMIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - ARGBMirrorRow = ARGBMirrorRow_SSSE3; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } } #endif #if defined(HAS_ARGBMIRRORROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { - ARGBMirrorRow = ARGBMirrorRow_AVX2; - } -#endif -#if defined(HAS_ARGBMIRRORROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { - ARGBMirrorRow = ARGBMirrorRow_NEON; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - CopyRow = CopyRow_X86; + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } } #endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) && - IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -141,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride, CopyRow = CopyRow_ERMS; } #endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc index d354e11fa..a23a40fee 100644 --- a/third_party/libyuv/source/rotate_neon.cc +++ b/third_party/libyuv/source/rotate_neon.cc @@ -17,7 +17,8 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) static uvec8 kVTbl4x4Transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; @@ -525,7 +526,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" ); } -#endif +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/rotate_neon64.cc b/third_party/libyuv/source/rotate_neon64.cc new file mode 100644 index 000000000..92358af7f --- /dev/null +++ b/third_party/libyuv/source/rotate_neon64.cc @@ -0,0 +1,543 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %3, %3, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v17.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v21.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v20.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v23.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %3, %3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
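Stepping outside the new rotate_neon64.cc listing for a moment: the aarch64 TransposeWx8_NEON above uses trn1/trn2 at byte, halfword and word granularity to transpose 8x8 byte tiles in registers. The scalar operation it accelerates is simply dst[x][y] = src[y][x] over an 8-row strip, as in the reference sketch below (an illustration of the semantics, not the libyuv C fallback itself):

#include <stdint.h>

// Transpose an 8-row strip: the source is width x 8 bytes, the destination
// is 8 x width bytes.
static void TransposeWx8_Ref(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}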
+ "adds %3, %3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %3, #2 \n" + "b.lt 3f \n" + + "cmp %3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + MEMACCESS(4) + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + MEMACCESS(0) + "st1 {v3.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v0.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %3, %3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v3.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %3, %3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld1 {v0.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld1 {v0.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width64) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23" + ); +} + +static uint8 kVTbl4x4TransposeDi[32] = + { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %4, %4, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v20.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v20.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %4, %4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, %4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2 \n" + "b.lt 3f \n" + + "cmp %4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + MEMACCESS(8) + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v18.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v17.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "st1 {v19.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %4, %4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v4.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v5.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %4, %4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.d}[0], 
[%2] \n" + MEMACCESS(3) + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width64) // %4 + : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 + "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 + "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v30", "v31" + ); +} +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc index ce8b3dad1..19340b3b7 100644 --- a/third_party/libyuv/source/row_any.cc +++ b/third_party/libyuv/source/row_any.cc @@ -17,17 +17,14 @@ namespace libyuv { extern "C" { #endif -// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels. -// TODO(fbarchard): Consider 'any' functions handling odd alignment. // YUV to RGB does multiple of 8 with SIMD and remainder with C. #define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, \ - const uint8* u_buf, \ - const uint8* v_buf, \ - uint8* rgb_buf, \ - int width) { \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + uint8* rgb_buf, int width) { \ int n = width & ~MASK; \ - I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \ + if (n > 0) { \ + I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \ + } \ I420TORGB_C(y_buf + n, \ u_buf + (n >> UV_SHIFT), \ v_buf + (n >> UV_SHIFT), \ @@ -35,36 +32,59 @@ extern "C" { } #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C, 1, 4, 7) -#endif // HAS_I422TOARGBROW_SSSE3 +#endif #ifdef HAS_I444TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C, 0, 4, 7) -YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C, 2, 4, 7) -YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C, 1, 4, 7) -YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C, 1, 4, 7) -YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C, 1, 4, 7) -// I422ToRGB565Row_SSSE3 is unaligned. YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C, 1, 2, 7) YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C, 1, 2, 7) -// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7) YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7) YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) #endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_J422TOARGBROW_SSSE3 +YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C, + 1, 4, 7) +#endif #ifdef HAS_I422TOARGBROW_AVX2 YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) -#endif // HAS_I422TOARGBROW_AVX2 +#endif +#ifdef HAS_I422TOBGRAROW_AVX2 +YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15) +#endif +#ifdef HAS_I422TOABGRROW_AVX2 +YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C, + 1, 2, 7) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C, + 1, 2, 7) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C, + 1, 2, 7) +#endif #ifdef HAS_I422TOARGBROW_NEON YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) @@ -79,214 +99,240 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, 1, 2, 7) YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) -#endif // HAS_I422TOARGBROW_NEON +#endif #ifdef HAS_I422TOYUY2ROW_NEON YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) -#endif // HAS_I422TOYUY2ROW_NEON +#endif #ifdef HAS_I422TOUYVYROW_NEON YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) -#endif // HAS_I422TOUYVYROW_NEON +#endif #undef YANY // Wrappers to handle odd width -#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \ - void NAMEANY(const uint8* y_buf, \ - const uint8* uv_buf, \ - uint8* rgb_buf, \ - int width) { \ - int n = width & ~7; \ - NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ +#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ + uint8* rgb_buf, int width) { \ + int n = width & ~MASK; \ + if (n > 0) { \ + NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ + } \ NV12TORGB_C(y_buf + n, \ uv_buf + (n >> UV_SHIFT), \ - rgb_buf + n * BPP, width & 7); \ + rgb_buf + n * BPP, width & MASK); \ } #ifdef HAS_NV12TOARGBROW_SSSE3 -NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, - 0, 4) -NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, - 0, 4) -#endif // HAS_NV12TOARGBROW_SSSE3 +NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7) +NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15) +NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15) +#endif #ifdef HAS_NV12TOARGBROW_NEON -NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4) -NV2NY(NV21ToARGBRow_Any_NEON, 
NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4) -#endif // HAS_NV12TOARGBROW_NEON +NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7) +NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, - 0, 2) + 0, 2, 7) NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, - 0, 2) -#endif // HAS_NV12TORGB565ROW_SSSE3 + 0, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C, + 0, 2, 15) +NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C, + 0, 2, 15) +#endif #ifdef HAS_NV12TORGB565ROW_NEON -NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2) -NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) -#endif // HAS_NV12TORGB565ROW_NEON +NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, + 0, 2, 7) +NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, + 0, 2, 7) +#endif #undef NVANY -#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ - void NAMEANY(const uint8* src, \ - uint8* dst, \ - int width) { \ +#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src, uint8* dst, int width) { \ int n = width & ~MASK; \ - ARGBTORGB_SIMD(src, dst, n); \ + if (n > 0) { \ + ARGBTORGB_SIMD(src, dst, n); \ + } \ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \ } #if defined(HAS_ARGBTORGB24ROW_SSSE3) RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C, - 15, 4, 3) + 4, 3, 15) RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C, - 15, 4, 3) + 4, 3, 15) RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C, - 3, 4, 2) + 4, 2, 3) RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, - 3, 4, 2) + 4, 2, 3) RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, - 3, 4, 2) + 4, 2, 3) #endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C, + 4, 2, 7) +RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C, + 4, 2, 7) +RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C, + 4, 2, 7) +#endif + #if defined(HAS_I400TOARGBROW_SSE2) -RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, - 7, 1, 4) +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7) #endif #if defined(HAS_YTOARGBROW_SSE2) -RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, - 7, 1, 4) -RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, - 15, 2, 4) -RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, - 15, 2, 4) -// These require alignment on ARGB, so C is used for remainder. 
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7) +#endif +#if defined(HAS_YTOARGBROW_AVX2) +RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 1, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_SSSE3) +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 2, 4, 15) +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 2, 4, 15) RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, - 15, 3, 4) -RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, - 15, 3, 4) + 3, 4, 15) +RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, 3, 4, 15) RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C, - 7, 2, 4) + 2, 4, 7) RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, - 7, 2, 4) + 2, 4, 7) RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, - 7, 2, 4) + 2, 4, 7) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31) +RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) -RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) -RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 4, 3, 7) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 4, 3, 7) RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C, - 7, 4, 2) + 4, 2, 7) RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, - 7, 4, 2) + 4, 2, 7) RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, - 7, 4, 2) -RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, - 7, 1, 4) -RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, - 7, 1, 4) -RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, - 7, 2, 4) -RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, - 7, 2, 4) + 4, 2, 7) +RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 1, 4, 7) +RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, 1, 4, 7) +RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, 2, 4, 7) +RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7) #endif #undef RGBANY // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. 
-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ - void NAMEANY(const uint8* src, \ - uint8* dst, uint32 selector, \ - int width) { \ +#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \ int n = width & ~MASK; \ - ARGBTORGB_SIMD(src, dst, selector, n); \ + if (n > 0) { \ + ARGBTORGB_SIMD(src, dst, selector, n); \ + } \ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \ } -#if defined(HAS_ARGBTOBAYERROW_SSSE3) -BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, - 7, 4, 1) -#endif -#if defined(HAS_ARGBTOBAYERROW_NEON) -BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, - 7, 4, 1) -#endif #if defined(HAS_ARGBTOBAYERGGROW_SSE2) BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, - 7, 4, 1) + 4, 1, 7) #endif #if defined(HAS_ARGBTOBAYERGGROW_NEON) BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, - 7, 4, 1) + 4, 1, 7) #endif #undef BAYERANY -// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. -#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \ - void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ - ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \ - ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \ - dst_y + (width - NUM) * BPP, NUM); \ - } - -#ifdef HAS_ARGBTOYROW_AVX2 -YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32) -YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32) -YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) -YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 -YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) -#endif -#ifdef HAS_BGRATOYROW_SSSE3 -YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) -YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) -#endif -#ifdef HAS_ARGBTOYJROW_SSSE3 -YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) -#endif -#ifdef HAS_ARGBTOYROW_NEON -YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) -YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) -YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) -YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) -YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) -YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) -YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) -YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) -YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) -YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) -#endif -#ifdef HAS_YUY2TOYROW_NEON -YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16) -#endif -#ifdef HAS_UYVYTOYROW_NEON -YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) -#endif -#ifdef HAS_RGB24TOARGBROW_NEON -YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) -#endif -#ifdef HAS_RAWTOARGBROW_NEON -YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) -#endif -#ifdef HAS_RGB565TOARGBROW_NEON -YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) -#endif -#ifdef HAS_ARGB1555TOARGBROW_NEON -YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) -#endif -#ifdef HAS_ARGB4444TOARGBROW_NEON -YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) -#endif -#undef YANY 
- #define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ int n = width & ~MASK; \ - ARGBTOY_SIMD(src_argb, dst_y, n); \ + if (n > 0) { \ + ARGBTOY_SIMD(src_argb, dst_y, n); \ + } \ ARGBTOY_C(src_argb + n * SBPP, \ dst_y + n * BPP, width & MASK); \ } - -// Attenuate is destructive so last16 method can not be used due to overlap. +#ifdef HAS_ARGBTOYROW_AVX2 +YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, BGRAToYRow_C, 4, 1, 15) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, ABGRToYRow_C, 4, 1, 15) +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, RGBAToYRow_C, 4, 1, 15) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, YUY2ToYRow_C, 2, 1, 15) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, UYVYToYRow_C, 2, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, ARGBToYJRow_C, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, ARGBToYRow_C, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, ARGBToYJRow_C, 4, 1, 7) +#endif +#ifdef HAS_BGRATOYROW_NEON +YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, BGRAToYRow_C, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_NEON +YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, ABGRToYRow_C, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_NEON +YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, RGBAToYRow_C, 4, 1, 7) +#endif +#ifdef HAS_RGB24TOYROW_NEON +YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, RGB24ToYRow_C, 3, 1, 7) +#endif +#ifdef HAS_RAWTOYROW_NEON +YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, RAWToYRow_C, 3, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_NEON +YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, RGB565ToYRow_C, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, ARGB1555ToYRow_C, 2, 1, 7) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, ARGB4444ToYRow_C, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, YUY2ToYRow_C, 2, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, UYVYToYRow_C, 2, 1, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, RGB24ToARGBRow_C, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, RAWToARGBRow_C, 3, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, RGB565ToARGBRow_C, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, ARGB1555ToARGBRow_C, + 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, ARGB4444ToARGBRow_C, + 2, 4, 7) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, 4, 4, 3) @@ -318,7 +364,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, void 
NAMEANY(const uint8* src_argb, int src_stride_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ int n = width & ~MASK; \ - ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ + if (n > 0) { \ + ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ + } \ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ dst_u + (n >> 1), \ dst_v + (n >> 1), \ @@ -327,29 +375,50 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, #ifdef HAS_ARGBTOUVROW_AVX2 UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15) +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) #endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) -UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, - 4, 15) -UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) -UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) -UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) -UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15) -UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) +#ifdef HAS_YUY2TOUVROW_SSE2 +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) #endif #ifdef HAS_YUY2TOUVROW_NEON @@ -360,11 +429,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) #endif #undef UVANY -#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \ - void NAMEANY(const uint8* src_uv, \ - uint8* dst_u, uint8* dst_v, int width) { \ +#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, SHIFT, MASK) \ + void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \ int n = width & ~MASK; \ - ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + if (n > 0) { \ + ANYTOUV_SIMD(src_uv, dst_u, 
dst_v, n); \ + } \ ANYTOUV_C(src_uv + n * BPP, \ dst_u + (n >> SHIFT), \ dst_v + (n >> SHIFT), \ @@ -372,42 +442,45 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) } #ifdef HAS_ARGBTOUV444ROW_SSSE3 -UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3, - ARGBToUV444Row_C, 4, 15, 0) +UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, + ARGBToUV444Row_C, 4, 0, 15) #endif #ifdef HAS_YUY2TOUV422ROW_AVX2 UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, - YUY2ToUV422Row_C, 2, 31, 1) + YUY2ToUV422Row_C, 2, 1, 31) UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, - UYVYToUV422Row_C, 2, 31, 1) + UYVYToUV422Row_C, 2, 1, 31) #endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, - ARGBToUV422Row_C, 4, 15, 1) -UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, - YUY2ToUV422Row_C, 2, 15, 1) -UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, - UYVYToUV422Row_C, 2, 15, 1) +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, + ARGBToUV422Row_C, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, + YUY2ToUV422Row_C, 2, 1, 15) +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, + UYVYToUV422Row_C, 2, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, - ARGBToUV444Row_C, 4, 7, 0) + ARGBToUV444Row_C, 4, 0, 7) UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, - ARGBToUV422Row_C, 4, 15, 1) + ARGBToUV422Row_C, 4, 1, 15) UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, - ARGBToUV411Row_C, 4, 31, 2) + ARGBToUV411Row_C, 4, 2, 31) UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, - YUY2ToUV422Row_C, 2, 15, 1) + YUY2ToUV422Row_C, 2, 1, 15) UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, - UYVYToUV422Row_C, 2, 15, 1) + UYVYToUV422Row_C, 2, 1, 15) #endif #undef UV422ANY #define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ - void NAMEANY(const uint8* src_uv, \ - uint8* dst_u, uint8* dst_v, int width) { \ + void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \ int n = width & ~MASK; \ - ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + if (n > 0) { \ + ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + } \ ANYTOUV_C(src_uv + n * 2, \ dst_u + n, \ dst_v + n, \ @@ -415,7 +488,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, } #ifdef HAS_SPLITUVROW_SSE2 -SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15) +SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_AVX2 SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) @@ -424,7 +497,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) #endif #ifdef HAS_SPLITUVROW_MIPS_DSPR2 -SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, +SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, SplitUVRow_C, 15) #endif #undef SPLITUVROWANY @@ -433,7 +506,9 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, void NAMEANY(const uint8* src_u, const uint8* src_v, \ uint8* dst_uv, int width) { \ int n = width & ~MASK; \ - ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \ + if (n > 0) { \ + ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \ + } \ ANYTOUV_C(src_u + n, \ src_v + n, \ dst_uv + n * 2, \ @@ -441,7 +516,7 @@ 
SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2, } #ifdef HAS_MERGEUVROW_SSE2 -MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15) +MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15) #endif #ifdef HAS_MERGEUVROW_AVX2 MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) @@ -455,7 +530,9 @@ MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \ uint8* dst_argb, int width) { \ int n = width & ~MASK; \ - ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \ + if (n > 0) { \ + ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \ + } \ ARGBMATH_C(src_argb0 + n * 4, \ src_argb1 + n * 4, \ dst_argb + n * 4, \ @@ -502,7 +579,9 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, void NAMEANY(const uint8* src_argb, uint8* dst_argb, \ const uint8* shuffler, int width) { \ int n = width & ~MASK; \ - ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + if (n > 0) { \ + ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + } \ ARGBTOY_C(src_argb + n * SBPP, \ dst_argb + n * BPP, shuffler, width & MASK); \ } @@ -512,7 +591,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, ARGBShuffleRow_C, 4, 4, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, ARGBShuffleRow_C, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 @@ -531,35 +610,107 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, ptrdiff_t src_stride_ptr, int width, \ int source_y_fraction) { \ int n = width & ~MASK; \ - TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \ - n, source_y_fraction); \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ TERP_C(dst_ptr + n * BPP, \ src_ptr + n * SBPP, src_stride_ptr, \ width & MASK, source_y_fraction); \ } #ifdef HAS_INTERPOLATEROW_AVX2 -NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, - InterpolateRow_C, 1, 1, 32) +NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, - InterpolateRow_C, 1, 1, 15) +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_SSE2 -NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, - InterpolateRow_C, 1, 1, 15) +NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON -NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, - InterpolateRow_C, 1, 1, 15) +NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 -NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, - InterpolateRow_C, 1, 1, 3) +NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C, + 1, 1, 3) #endif #undef NANY +#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \ + void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + int r = width & MASK; \ + if (n > 0) { \ + MIRROR_SIMD(src_y, dst_y + r * BPP, n); \ + } \ + MIRROR_C(src_y + n * BPP, dst_y, r); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +MANY(MirrorRow_Any_AVX2, MirrorRow_AVX2, MirrorRow_C, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +MANY(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, MirrorRow_C, 1, 15) +#endif +#ifdef HAS_MIRRORROW_SSE2 
+MANY(MirrorRow_Any_SSE2, MirrorRow_SSE2, MirrorRow_C, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3) +#endif +#undef MANY + +#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \ + void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + int r = width & MASK; \ + if (n > 0) { \ + COPY_SIMD(src_y, dst_y, n); \ + } \ + COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \ + } + +#ifdef HAS_COPYROW_AVX +MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31) +#endif +#undef MANY + +#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \ + void NAMEANY(uint8* dst_y, T v8, int width) { \ + int n = width & ~MASK; \ + int r = width & MASK; \ + if (n > 0) { \ + SET_SIMD(dst_y, v8, n); \ + } \ + SET_C(dst_y + n * BPP, v8, r); \ + } + +#ifdef HAS_SETROW_X86 +SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3) +#endif +#undef SETANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc index fa2b752a2..e0e2bf426 100644 --- a/third_party/libyuv/source/row_common.cc +++ b/third_party/libyuv/source/row_common.cc @@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = dither8x8[x & 7] - 128; + int dither1 = dither8x8[(x & 7) + 1] - 128; + uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; + uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; + uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; + uint8 b1 = Clamp(src_argb[4] + dither1) >> 3; + uint8 g1 = Clamp(src_argb[5] + dither1) >> 2; + uint8 r1 = Clamp(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = dither8x8[(width - 1) & 7] - 128; + uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; + uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; + uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -385,6 +411,28 @@ void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, 
ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, ag, ab); + } +} + void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { int x; for (x = 0; x < width; ++x) { @@ -938,33 +986,52 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { } } +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + // C reference code that mimics the YUV assembly. - -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ - -#define UB 127 /* min(63,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 - -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ - -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 - static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* b, uint8* g, uint8* r) { - int32 y1 = ((int32)(y) - 16) * YG; - *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6); - *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6); - *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6); + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(BB - ( u * UB) + y1) >> 6); + *g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6); + *r = Clamp((int32)(BR - (v * VR ) + y1) >> 6); } +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(y1 - YGB) >> 6); + *g = Clamp((int32)(y1 - YGB) >> 6); + *r = Clamp((int32)(y1 - YGB) >> 6); +} + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + #if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. void I444ToARGBRow_C(const uint8* src_y, @@ -1008,6 +1075,7 @@ void I444ToARGBRow_C(const uint8* src_y, } } #endif + // Also used for 420 void I422ToARGBRow_C(const uint8* src_y, const uint8* src_u, @@ -1034,6 +1102,59 @@ void I422ToARGBRow_C(const uint8* src_y, } } +// C reference code that mimics the YUV assembly. 
+// * R = Y + 1.40200 * Cr +// * G = Y - 0.34414 * Cb - 0.71414 * Cr +// * B = Y + 1.77200 * Cb + +#define YGJ 64 /* (int8)round(1.000 * 64) */ + +#define UBJ 113 /* (int8)round(1.772 * 64) */ +#define UGJ -22 /* (int8)round(-0.34414 * 64) */ +#define URJ 0 + +#define VBJ 0 +#define VGJ -46 /* (int8)round(-0.71414 * 64) */ +#define VRJ 90 /* (int8)round(1.402 * 64) */ + +// Bias +#define BBJ (UBJ * 128 + VBJ * 128) +#define BGJ (UGJ * 128 + VGJ * 128) +#define BRJ (URJ * 128 + VRJ * 128) + +static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * YGJ); + *b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6); + *g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6); + *r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6); +} + +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvJPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + void I422ToRGB24Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1470,18 +1591,15 @@ void I422ToRGBARow_C(const uint8* src_y, void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], 128, 128, - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf[3] = 255; - YuvPixel(src_y[1], 128, 128, - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], 128, 128, - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf[3] = 255; } } @@ -1569,28 +1687,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint32 v8, int count) { -#ifdef _MSC_VER - // VC will generate rep stosb. - int x; - for (x = 0; x < count; ++x) { - dst[x] = v8; - } -#else - memset(dst, v8, count); -#endif +void SetRow_C(uint8* dst, uint8 v8, int width) { + memset(dst, v8, width); } -void ARGBSetRows_C(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - int y; - for (y = 0; y < height; ++y) { - uint32* d = (uint32*)(dst); - int x; - for (x = 0; x < width; ++x) { - d[x] = v32; - } - dst += dst_stride; +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { + uint32* d = (uint32*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; } } @@ -1885,17 +1990,17 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } } -// Blend 2 rows into 1 for conversions such as I422ToI420. -void HalfRow_C(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { +// Blend 2 rows into 1. 
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { int x; for (x = 0; x < pix; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, - uint16* dst_uv, int pix) { +static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { int x; for (x = 0; x < pix; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -1957,24 +2062,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG -void ARGBToBayerRow_C(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { - int index0 = selector & 0xff; - int index1 = (selector >> 8) & 0xff; - // Copy a row of Bayer. - int x; - for (x = 0; x < pix - 1; x += 2) { - dst_bayer[0] = src_argb[index0]; - dst_bayer[1] = src_argb[index1]; - src_argb += 8; - dst_bayer += 2; - } - if (pix & 1) { - dst_bayer[0] = src_argb[index0]; - } -} - // Select G channel from ARGB. e.g. GGGGGGGG void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -2061,122 +2148,272 @@ void I422ToUYVYRow_C(const uint8* src_y, } } -#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* rgb_buf, + uint8* dst_rgb565, int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToRGB565Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } } -#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) +#endif -#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +#if defined(HAS_I422TOARGB1555ROW_SSSE3) void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* rgb_buf, + uint8* dst_argb1555, int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToARGB1555Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } } +#endif +#if defined(HAS_I422TOARGB4444ROW_SSSE3) void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* rgb_buf, + uint8* dst_argb4444, int width) { - // Allocate a row of ARGB. 
- align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToARGB4444Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } } +#endif -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - NV12ToARGBRow_SSSE3(src_y, src_uv, row, width); - ARGBToRGB565Row_SSE2(row, dst_rgb565, width); - free_aligned_buffer_64(row); +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } } +#endif -void NV21ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_rgb565, - int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); - ARGBToRGB565Row_SSE2(row, dst_rgb565, width); - free_aligned_buffer_64(row); +#if defined(HAS_NV21TORGB565ROW_SSSE3) +void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } } +#endif -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width); - YUY2ToYRow_SSE2(src_yuy2, row_y, width); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +#if defined(HAS_YUY2TOARGBROW_SSSE3) +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } } +#endif -void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width); - YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +#if defined(HAS_UYVYTOARGBROW_SSSE3) +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_SSE2(src_uyvy, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } } +#endif // !defined(LIBYUV_DISABLE_X86) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width); - UYVYToYRow_SSE2(src_uyvy, row_y, width); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } } +#endif -void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width); - UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width); - I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } } +#endif -#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_AVX2) +void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_YUY2TOARGBROW_AVX2) +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_AVX2(src_yuy2, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif + +#if defined(HAS_UYVYTOARGBROW_AVX2) +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_AVX2(src_uyvy, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} #endif // !defined(LIBYUV_DISABLE_X86) void ARGBPolynomialRow_C(const uint8* src_argb, diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc index ae9370c1b..cfc9ffe03 100644 --- a/third_party/libyuv/source/row_mips.cc +++ b/third_party/libyuv/source/row_mips.cc @@ -378,7 +378,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { // MIPS DSPR2 functions #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { @@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ); } -void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, - uint8* dst_v, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - ".p2align 2 \n" - "1: \n" - "addiu $t4, $t4, -1 \n" - "lwr $t0, 0(%[src_uv]) \n" - "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lwr $t1, 4(%[src_uv]) \n" - "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lwr $t2, 8(%[src_uv]) \n" - "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lwr $t3, 12(%[src_uv]) \n" - "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lwr $t5, 16(%[src_uv]) \n" - "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lwr $t6, 20(%[src_uv]) \n" - "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lwr $t7, 24(%[src_uv]) \n" - "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lwr $t8, 28(%[src_uv]) \n" - "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "addiu %[src_uv], %[src_uv], 32 \n" - "swr $t9, 0(%[dst_v]) \n" - "swl $t9, 3(%[dst_v]) \n" - "swr $t0, 0(%[dst_u]) \n" - "swl $t0, 3(%[dst_u]) \n" - "swr $t1, 4(%[dst_v]) \n" - "swl $t1, 7(%[dst_v]) \n" - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" - "swr $t3, 8(%[dst_v]) \n" - "swl $t3, 11(%[dst_v]) \n" - "swr $t5, 8(%[dst_u]) \n" - "swl $t5, 11(%[dst_u]) \n" - "swr $t6, 12(%[dst_v]) \n" - "swl $t6, 15(%[dst_v]) \n" - "swr $t7, 12(%[dst_u]) \n" - "swl $t7, 15(%[dst_u]) \n" - "addiu %[dst_u], %[dst_u], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_v], %[dst_v], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - 
"t4", "t5", "t6", "t7", "t8", "t9" - ); -} - void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { __asm__ __volatile__ ( ".set push \n" @@ -927,9 +844,9 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, } // Bilinear filter 8x2 -> 8x1 -void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { int y0_fraction = 256 - source_y_fraction; const uint8* src_ptr1 = src_ptr + src_stride; diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index 1392cf5fc..8badc5a9b 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -16,7 +16,8 @@ extern "C" { #endif // This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ @@ -92,36 +93,73 @@ extern "C" { "vuzp.u8 d2, d3 \n" \ "vtrn.u32 d2, d3 \n" -#define YUV422TORGB \ - "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ - "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ - "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ - "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ - "vtrn.u8 d0, d1 \n" \ - "vsub.s16 q0, q0, q15 \n"/* offset y */\ - "vmul.s16 q0, q0, q14 \n" \ - "vadd.s16 d18, d19 \n" \ - "vqadd.s16 d20, d0, d16 \n" /* B */ \ - "vqadd.s16 d21, d1, d16 \n" \ - "vqadd.s16 d22, d0, d17 \n" /* R */ \ - "vqadd.s16 d23, d1, d17 \n" \ - "vqadd.s16 d16, d0, d18 \n" /* G */ \ - "vqadd.s16 d17, d1, d18 \n" \ - "vqshrun.s16 d0, q10, #6 \n" /* B */ \ - "vqshrun.s16 d1, q11, #6 \n" /* G */ \ - "vqshrun.s16 d2, q8, #6 \n" /* R */ \ - "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ - "vmovl.u8 q11, d1 \n" \ - "vmovl.u8 q8, d2 \n" \ - "vtrn.u8 d20, d21 \n" \ - "vtrn.u8 d22, d23 \n" \ - "vtrn.u8 d16, d17 \n" \ - "vmov.u8 d21, d16 \n" +#define YUV422TORGB_SETUP_REG \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" -static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, - 0, 0, 0, 0, 0, 0, 0, 0 }; +#define YUV422TORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ + "vmovl.u8 q0, d0 \n" /* Y */\ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */\ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -129,13 +167,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV444 @@ -150,8 +182,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -163,13 +197,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -184,8 +212,10 @@ void I422ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : 
[kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -197,13 +227,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV411 @@ -218,8 +242,10 @@ void I411ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -231,13 +257,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -253,8 +273,10 @@ void I422ToBGRARow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_bgra), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -266,13 +288,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -288,8 +304,10 @@ void I422ToABGRRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_abgr), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -301,13 +319,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -322,8 +334,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgba), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -335,13 +349,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -355,8 +363,10 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb24), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 
- "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -368,13 +378,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -389,8 +393,10 @@ void I422ToRAWRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_raw), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -414,13 +420,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -435,8 +435,10 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb565), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -463,13 +465,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV422 @@ -485,8 +481,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb1555), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -507,13 +505,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. 
".p2align 2 \n" "1: \n" @@ -530,8 +522,10 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb4444), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -541,13 +535,7 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUV400 @@ -560,8 +548,10 @@ void YToARGBRow_NEON(const uint8* src_y, : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -595,13 +585,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READNV12 @@ -615,8 +599,10 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -627,13 +613,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READNV21 @@ -647,8 +627,10 @@ void NV21ToARGBRow_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -659,13 +641,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READNV12 @@ -679,8 +655,10 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -691,13 +669,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + 
YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READNV21 @@ -711,8 +683,10 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -722,13 +696,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READYUY2 @@ -741,8 +709,10 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -752,13 +722,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" + YUV422TORGB_SETUP_REG ".p2align 2 \n" "1: \n" READUYVY @@ -771,8 +735,10 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -844,12 +810,28 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ); } -// SetRow8 writes 'count' bytes using a 32 bit value repeated. -void SetRow_NEON(uint8* dst, uint32 v32, int count) { +// SetRow writes 'count' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8* dst, uint8 v8, int count) { + asm volatile ( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0" + ); +} + +// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" @@ -860,16 +842,6 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { ); } -// TODO(fbarchard): Make fully assembler -// SetRow32 writes 'count' words using a 32 bit value repeated. -void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - SetRow_NEON(dst, v32, width << 2); - dst += dst_stride; - } -} - void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { asm volatile ( // Start at end of source row. 
@@ -1273,53 +1245,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ); } -void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. - "vrhadd.u8 q0, q1 \n" // average row 1 and 2 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(src_uv_stride), // %1 - "+r"(dst_uv), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG -void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - asm volatile ( - "vmov.u32 d6[0], %3 \n" // selector - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels - "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels - "vtrn.u32 d4, d5 \n" // combine 8 pixels - MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" // store 8. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : "r"(selector) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - // Select G channels from ARGB. e.g. GGGGGGGG void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { @@ -2832,7 +2757,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q9, d18 \n" // g "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q15, d22 \n" // a + "vmovl.u8 q11, d22 \n" // a "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R @@ -2853,10 +2778,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R @@ -2872,7 +2797,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -3140,7 +3065,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "q0", "q1" // Clobber List ); } -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 21111cf60..ddccd5d98 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ 
b/third_party/libyuv/source/row_neon64.cc @@ -1,5 +1,5 @@ /* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * Copyright 2014 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -15,113 +15,157 @@ namespace libyuv { extern "C" { #endif -// This module is for GCC Neon +// This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ - "vld1.32 {d2[0]}, [%1]! \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ MEMACCESS(2) \ - "vld1.32 {d2[1]}, [%2]! \n" + "ld1 {v1.s}[1], [%2], #4 \n" // Read 8 Y, 2 U and 2 V from 422 #define READYUV411 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ - "vld1.16 {d2[0]}, [%1]! \n" \ + "ld1 {v2.h}[0], [%1], #2 \n" \ MEMACCESS(2) \ - "vld1.16 {d2[1]}, [%2]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d2, d3 \n" + "ld1 {v2.h}[1], [%2], #2 \n" \ + "zip1 v1.8b, v2.8b, v2.8b \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ MEMACCESS(2) \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ + "ld1 {v0.8b}, [%0], #8 \n" \ MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 #define READYUY2 \ MEMACCESS(0) \ - "vld2.8 {d0, d2}, [%0]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 UYVY #define READUYVY \ MEMACCESS(0) \ - "vld2.8 {d2, d3}, [%0]! 
\n" \ - "vmov.u8 d0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" -#define YUV422TORGB \ - "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ - "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ - "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ - "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ - "vtrn.u8 d0, d1 \n" \ - "vsub.s16 q0, q0, q15 \n"/* offset y */\ - "vmul.s16 q0, q0, q14 \n" \ - "vadd.s16 d18, d19 \n" \ - "vqadd.s16 d20, d0, d16 \n" /* B */ \ - "vqadd.s16 d21, d1, d16 \n" \ - "vqadd.s16 d22, d0, d17 \n" /* R */ \ - "vqadd.s16 d23, d1, d17 \n" \ - "vqadd.s16 d16, d0, d18 \n" /* G */ \ - "vqadd.s16 d17, d1, d18 \n" \ - "vqshrun.s16 d0, q10, #6 \n" /* B */ \ - "vqshrun.s16 d1, q11, #6 \n" /* G */ \ - "vqshrun.s16 d2, q8, #6 \n" /* R */ \ - "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ - "vmovl.u8 q11, d1 \n" \ - "vmovl.u8 q8, d2 \n" \ - "vtrn.u8 d20, d21 \n" \ - "vtrn.u8 d22, d23 \n" \ - "vtrn.u8 d16, d17 \n" \ - "vmov.u8 d21, d16 \n" +#define YUV422TORGB_SETUP_REG \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "movi v27.8h, #128 \n" \ + "movi v28.8h, #102 \n" \ + "movi v29.8h, #25 \n" \ + "movi v30.8h, #52 \n" + +#define YUV422TORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. 
+#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ -static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, - 0, 0, 0, 0, 0, 0, 0, 0 }; #ifdef HAS_I444TOARGBROW_NEON void I444ToARGBRow_NEON(const uint8* src_y, @@ -130,31 +174,24 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV444 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I444TOARGBROW_NEON @@ -166,31 +203,24 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TOARGBROW_NEON @@ -202,31 +232,24 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV411 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I411TOARGBROW_NEON @@ -238,32 +261,24 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v21, v22, v23) "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" - "vmov.u8 d19, #255 \n" + "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_bgra), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TOBGRAROW_NEON @@ -275,32 +290,24 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v20, v21, v22) "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_abgr), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TOABGRROW_NEON @@ -312,31 +319,24 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v23, v22, v21) "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" + "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_rgba), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TORGBAROW_NEON @@ -348,30 +348,23 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TORGB24ROW_NEON @@ -383,46 +376,33 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v20, v21, v22) "subs %4, %4, #8 \n" - "vswp.u8 d20, d22 \n" MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_raw), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TORAWROW_NEON #define ARGBTORGB565 \ - "vshr.u8 d20, d20, #3 \n" /* B */ \ - "vshr.u8 d21, d21, #2 \n" /* G */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #11 \n" /* R */ \ - "vorr q0, q8, q9 \n" /* BG */ \ - "vorr q0, q0, q10 \n" /* BGR */ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ #ifdef HAS_I422TORGB565ROW_NEON void I422ToRGB565Row_NEON(const uint8* src_y, @@ -431,49 +411,36 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" ARGBTORGB565 MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_rgb565), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TORGB565ROW_NEON #define ARGBTOARGB1555 \ - "vshr.u8 q10, q10, #3 \n" /* B */ \ - "vshr.u8 d22, d22, #3 \n" /* R */ \ - "vshr.u8 d23, d23, #7 \n" /* A */ \ - "vmovl.u8 q8, d20 \n" /* B */ \ - "vmovl.u8 q9, d21 \n" /* G */ \ - "vmovl.u8 q10, d22 \n" /* R */ \ - "vmovl.u8 q11, d23 \n" /* A */ \ - "vshl.u16 q9, q9, #5 \n" /* G */ \ - "vshl.u16 q10, q10, #10 \n" /* R */ \ - "vshl.u16 q11, q11, #15 \n" /* A */ \ - "vorr q0, q8, q9 \n" /* BG */ \ - "vorr q1, q10, q11 \n" /* RA */ \ - "vorr q0, q0, q1 \n" /* BGRA */ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ #ifdef HAS_I422TOARGB1555ROW_NEON void I422ToARGB1555Row_NEON(const uint8* src_y, @@ -482,44 +449,38 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_argb1555), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TOARGB1555ROW_NEON #define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ #ifdef HAS_I422TOARGB4444ROW_NEON void I422ToARGB4444Row_NEON(const uint8* src_y, @@ -528,33 +489,26 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( - MEMACCESS(5) - "vld1.8 {d24}, [%5] \n" - MEMACCESS(6) - "vld1.8 {d25}, [%6] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. 
- ".p2align 2 \n" + YUV422TORGB_SETUP_REG + "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" READYUV422 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" ARGBTOARGB4444 MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_argb4444), // %3 "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_I422TOARGB4444ROW_NEON @@ -564,29 +518,22 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV400 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_YTOARGBROW_NEON @@ -596,22 +543,21 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - ".p2align 2 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" "subs %2, %2, #8 \n" MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "d20", "d21", "d22", "d23" + : "cc", "memory", "v20", "v21", "v22", "v23" ); } #endif // HAS_I400TOARGBROW_NEON @@ -622,30 +568,23 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV12 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_NV12TOARGBROW_NEON @@ -656,30 +595,23 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV21 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %3, %3, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_NV21TOARGBROW_NEON @@ -690,30 +622,23 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV12 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %3, %3, #8 \n" ARGBTORGB565 MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_NV12TORGB565ROW_NEON @@ -724,30 +649,23 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - MEMACCESS(4) - "vld1.8 {d24}, [%4] \n" - MEMACCESS(5) - "vld1.8 {d25}, [%5] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV21 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %3, %3, #8 \n" ARGBTORGB565 MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_NV21TORGB565ROW_NEON @@ -757,29 +675,22 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUY2 - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_YUY2TOARGBROW_NEON @@ -789,29 +700,22 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( - MEMACCESS(3) - "vld1.8 {d24}, [%3] \n" - MEMACCESS(4) - "vld1.8 {d25}, [%4] \n" - "vmov.u8 d26, #128 \n" - "vmov.u16 q14, #74 \n" - "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + YUV422TORGB_SETUP_REG "1: \n" READUYVY - YUV422TORGB + YUV422TORGB(v22, v21, v20) "subs %2, %2, #8 \n" - "vmov.u8 d23, #255 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! 
\n" - "bgt 1b \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "r"(&kUVToRB), // %3 - "r"(&kUVToG) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } #endif // HAS_UYVYTOARGBROW_NEON @@ -821,16 +725,15 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store U MEMACCESS(2) "st1 {v1.16b}, [%2], #16 \n" // store V - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -846,7 +749,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load U @@ -854,8 +756,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(2) - "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "bgt 1b \n" + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 @@ -871,14 +773,13 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #ifdef HAS_COPYROW_NEON void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 + "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop MEMACCESS(1) - "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 - "bgt 1b \n" + "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(count) // %2 // Output registers @@ -888,35 +789,36 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_NEON -// SetRow8 writes 'count' bytes using a 32 bit value repeated. -#ifdef HAS_SETROW_NEON -void SetRow_NEON(uint8* dst, uint32 v32, int count) { +// SetRow writes 'count' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store - "bgt 1b \n" + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0" + ); +} + +void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 ints per loop + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 : "cc", "memory", "v0" ); } -#endif // HAS_SETROW_NEON - -// TODO(fbarchard): Make fully assembler -// SetRow32 writes 'count' words using a 32 bit value repeated. 
-#ifdef HAS_ARGBSETROWS_NEON -void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - SetRow_NEON(dst, v32, width << 2); - dst += dst_stride; - } -} -#endif // HAS_ARGBSETROWS_NEON #ifdef HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { @@ -925,7 +827,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 @@ -935,7 +836,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) "st1 {v0.D}[0], [%1], #8 \n" - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -953,7 +854,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "add %0, %0, %3, lsl #1 \n" "sub %0, %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 @@ -961,10 +861,10 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "rev64 v0.8b, v0.8b \n" "rev64 v1.8b, v1.8b \n" MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" - "bgt 1b \n" + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -982,7 +882,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 @@ -992,7 +891,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 MEMACCESS(1) "st1 {v0.D}[0], [%1], #8 \n" - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -1006,14 +905,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "movi v4.8b, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -1027,16 +925,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "movi v5.8b, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %2, %2, #8 \n" // 8 processed per loop. 
- "mov v3.8b, v1.8b \n" // move g - "mov v4.8b, v0.8b \n" // move r + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r MEMACCESS(1) - "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a - "bgt 1b \n" + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -1047,118 +944,127 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { #endif // HAS_RAWTOARGBROW_NEON #define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ #ifdef HAS_RGB565TOARGBROW_NEON void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + "movi v3.8b, #255 \n" // Alpha "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List ); } #endif // HAS_RGB565TOARGBROW_NEON #define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. #define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ \ #ifdef HAS_ARGB1555TOARGBROW_NEON void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + "movi v3.8b, #255 \n" // Alpha "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_ARGB1555TOARGBROW_NEON #define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" #ifdef HAS_ARGB4444TOARGBROW_NEON void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix) { asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } #endif // HAS_ARGB4444TOARGBROW_NEON @@ -1166,14 +1072,13 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, #ifdef HAS_ARGBTORGB24ROW_NEON void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. - "bgt 1b \n" + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(pix) // %2 @@ -1186,16 +1091,15 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { #ifdef HAS_ARGBTORAWROW_NEON void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %2, %2, #8 \n" // 8 processed per loop. 
- "mov v4.8b, v2.8b \n" // mov g - "mov v5.8b, v1.8b \n" // mov b + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b MEMACCESS(1) - "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b - "bgt 1b \n" + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(pix) // %2 @@ -1208,14 +1112,13 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { #ifdef HAS_YUY2TOYROW_NEON void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1228,14 +1131,13 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { #ifdef HAS_UYVYTOYROW_NEON void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1249,16 +1151,15 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v1.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1273,16 +1174,15 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v0.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -1296,29 +1196,29 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, #ifdef HAS_YUY2TOUVROW_NEON void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile ( - "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. 
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V MEMACCESS(2) "st1 {v1.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) "st1 {v3.8b}, [%3], #8 \n" // store 8 V. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 + "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List ); } #endif // HAS_YUY2TOUVROW_NEON @@ -1326,84 +1226,33 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, #ifdef HAS_UYVYTOUVROW_NEON void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_uyvyb = src_uyvy + stride_uyvy; asm volatile ( - "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 U. MEMACCESS(3) "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 + "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List ); } #endif // HAS_UYVYTOUVROW_NEON -#ifdef HAS_HALFROW_NEON -void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - asm volatile ( - // change the stride to row 2 pointer - "add %x1, %x0, %w1, sxtw \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. - "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(src_uv_stride), // %1 - "+r"(dst_uv), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} -#endif // HAS_HALFROW_NEON - -// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG -#ifdef HAS_ARGBTOBAYERROW_NEON -void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - asm volatile ( - "mov v2.s[0], %w3 \n" // selector - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels - "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels - "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels - MEMACCESS(1) - "st1 {v4.8b}, [%1], #8 \n" // store 8. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : "r"(selector) // %3 - : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List - ); -} -#endif // HAS_ARGBTOBAYERROW_NEON - // Select G channels from ARGB. e.g. 
GGGGGGGG #ifdef HAS_ARGBTOBAYERGGROW_NEON void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, @@ -1411,11 +1260,11 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 @@ -1439,7 +1288,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 4. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -1455,19 +1304,18 @@ void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_yuy2, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "mov v2.8b, v1.8b \n" + "orr v2.8b, v1.8b, v1.8b \n" MEMACCESS(1) "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1485,19 +1333,18 @@ void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_uyvy, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys - "mov v3.8b, v2.8b \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" MEMACCESS(1) "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -1512,20 +1359,19 @@ void I422ToUYVYRow_NEON(const uint8* src_y, #ifdef HAS_ARGBTORGB565ROW_NEON void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. - "bgt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTORGB565ROW_NEON @@ -1534,20 +1380,19 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. 
- "bgt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTOARGB1555ROW_NEON @@ -1556,21 +1401,20 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) { asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" + "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" ); } #endif // HAS_ARGBTOARGB4444ROW_NEON @@ -1582,10 +1426,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1594,7 +1437,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1610,10 +1453,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1621,7 +1463,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y MEMACCESS(1) "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
- "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -1636,41 +1478,41 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v24", "v25", "v26", "v27", "v28", "v29" ); } #endif // HAS_ARGBTOUV444ROW_NEON @@ -1680,49 +1522,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "subs %3, %3, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBTOUV422ROW_NEON @@ -1732,128 +1566,108 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. - MEMACCESS(0) - "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. - "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. + "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. - "vpadd.u16 d1, d8, d9 \n" // B - "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. - "vpadd.u16 d3, d10, d11 \n" // G - "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. - "vpadd.u16 d5, d12, d13 \n" // R + "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. 
+ "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. + "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %3, %3, #32 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBTOUV411ROW_NEON // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. 
+ #ifdef HAS_ARGBTOUVROW_NEON void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q0, q1, q2) + RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 + "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBTOUVROW_NEON @@ -1862,50 +1676,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, #ifdef HAS_ARGBTOUVJROW_NEON void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient - "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient - "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q0, q1, q2) + RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 - "+r"(src_stride_argb), // %1 + "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBTOUVJROW_NEON @@ -1913,50 +1722,40 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, #ifdef HAS_BGRATOUVROW_NEON void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_bgra - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. - "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - MEMACCESS(1) - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. - "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q1, q1, #1 \n" // 2x average - "vrshr.u16 q2, q2, #1 \n" - "vrshr.u16 q3, q3, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q3, q2, q1) + RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_bgra), // %0 - "+r"(src_stride_bgra), // %1 + "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_BGRATOUVROW_NEON @@ -1964,50 +1763,40 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, #ifdef HAS_ABGRTOUVROW_NEON void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_abgr - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - MEMACCESS(1) - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q2, q1, q0) + RGBTOUV(v0.8h, v2.8h, v1.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_abgr), // %0 - "+r"(src_stride_abgr), // %1 + "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_ABGRTOUVROW_NEON @@ -2015,50 +1804,40 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, #ifdef HAS_RGBATOUVROW_NEON void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgba - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. - "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - MEMACCESS(1) - "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. - "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q0, q1, q2) + RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_rgba), // %0 - "+r"(src_stride_rgba), // %1 + "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_RGBATOUVROW_NEON @@ -2066,50 +1845,40 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, #ifdef HAS_RGB24TOUVROW_NEON void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - MEMACCESS(0) - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - MEMACCESS(1) - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q0, q1, q2) + RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_rgb24), // %0 - "+r"(src_stride_rgb24), // %1 + "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_RGB24TOUVROW_NEON @@ -2117,50 +1886,40 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, #ifdef HAS_RAWTOUVROW_NEON void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_raw_1 = src_raw + src_stride_raw; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - MEMACCESS(0) - "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. - "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(1) - "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - MEMACCESS(1) - "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. - "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" "subs %4, %4, #16 \n" // 32 processed per loop. - RGBTOUV(q2, q1, q0) + RGBTOUV(v2.8h, v1.8h, v0.8h) MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_raw), // %0 - "+r"(src_stride_raw), // %1 + "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" ); } #endif // HAS_RAWTOUVROW_NEON @@ -2169,70 +1928,74 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, #ifdef HAS_RGB565TOUVROW_NEON void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
- "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 + "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27" ); } #endif // HAS_RGB565TOUVROW_NEON @@ -2241,70 +2004,69 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, #ifdef HAS_ARGB1555TOUVROW_NEON void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 + "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" ); } #endif // HAS_ARGB1555TOUVROW_NEON @@ -2313,70 +2075,70 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, #ifdef HAS_ARGB4444TOUVROW_NEON void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + RGBTOUV_SETUP_REG "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. 
- "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 + "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + ); } #endif // HAS_ARGB4444TOUVROW_NEON @@ -2384,29 +2146,29 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, #ifdef HAS_RGB565TOYROW_NEON void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", + "v24", "v25", "v26", "v27" ); } #endif // HAS_RGB565TOYROW_NEON @@ -2414,29 +2176,28 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { #ifdef HAS_ARGB1555TOYROW_NEON void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGB1555TOYROW_NEON @@ -2444,29 +2205,28 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { #ifdef HAS_ARGB4444TOYROW_NEON void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" ); } #endif // HAS_ARGB4444TOYROW_NEON @@ -2474,28 +2234,27 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { #ifdef HAS_BGRATOYROW_NEON void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_BGRATOYROW_NEON @@ -2503,28 +2262,27 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { #ifdef HAS_ABGRTOYROW_NEON void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 + "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_ABGRTOYROW_NEON @@ -2532,28 +2290,27 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { #ifdef HAS_RGBATOYROW_NEON void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 + "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RGBATOYROW_NEON @@ -2561,28 +2318,27 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { #ifdef HAS_RGB24TOYROW_NEON void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(dst_y), // %1 + "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RGB24TOYROW_NEON @@ -2590,28 +2346,27 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { #ifdef HAS_RAWTOYROW_NEON void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant "1: \n" MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(dst_y), // %1 + "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RAWTOYROW_NEON @@ -2621,96 +2376,98 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; asm volatile ( "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" + "b.eq 100f \n" "cmp %4, #64 \n" - "beq 75f \n" + "b.eq 75f \n" "cmp %4, #128 \n" - "beq 50f \n" + "b.eq 50f \n" "cmp %4, #192 \n" - "beq 25f \n" + "b.eq 25f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" // General purpose row blend. "1: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! 
\n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" + "ld1 {v1.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" + "ld1 {v0.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" "subs %3, %3, #16 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 + "+r"(src_ptr1), // %2 "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" ); } #endif // HAS_INTERPOLATEROW_NEON @@ -2720,55 +2477,59 @@ void InterpolateRow_NEON(uint8* dst_ptr, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" + "subs %3, %3, #8 \n" + "b.lt 89f \n" // Blend 8 pixels. "8: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. 
- "bge 8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.ge 8b \n" "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" + "adds %3, %3, #8-1 \n" + "b.lt 99f \n" // Blend 1 pixels. "1: \n" MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" "99: \n" @@ -2777,7 +2538,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" ); } #endif // HAS_ARGBBLENDROW_NEON @@ -2789,22 +2551,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // Attenuate 8 pixels. "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBATTENUATEROW_NEON @@ -2815,41 +2577,40 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 "r"(interval_size), // %3 "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBQUANTIZEROW_NEON @@ -2861,36 +2622,35 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 
255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBSHADEROW_NEON @@ -2901,28 +2661,27 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBGRAYROW_NEON void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 
+ "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" ); } #endif // HAS_ARGBGRAYROW_NEON @@ -2935,40 +2694,39 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBSEPIAROW_NEON void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - ".p2align 2 \n" + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" ); } #endif // HAS_ARGBSEPIAROW_NEON @@ -2981,60 +2739,59 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. 
"subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q15, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBCOLORMATRIXROW_NEON @@ -3046,12 +2803,11 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v1.8h, v1.8b, v5.8b \n" // multiply G @@ -3062,8 +2818,8 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 
- "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -3081,20 +2837,19 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -3112,20 +2867,19 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -3148,7 +2902,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. @@ -3156,11 +2909,11 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add - "mov v1.8b, v0.8b \n" - "mov v2.8b, v0.8b \n" + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -3177,7 +2930,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { asm volatile ( // 16 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. @@ -3187,7 +2939,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "uqadd v0.16b, v0.16b, v1.16b \n" // add MEMACCESS(2) "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 @@ -3209,7 +2961,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 
@@ -3218,8 +2969,8 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "subs %3, %3, #8 \n" // 8 processed per loop. "uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. - "bgt 1b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 @@ -3238,7 +2989,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.8b}, [%0],%5 \n" // top @@ -3263,7 +3013,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "uqxtn v0.8b, v0.8h \n" MEMACCESS(3) "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 @@ -3284,7 +3034,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.8b}, [%0],%4 \n" // left @@ -3309,7 +3058,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "uqxtn v0.8b, v0.8h \n" MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc index 106fda568..1a6f7dc4d 100644 --- a/third_party/libyuv/source/row_posix.cc +++ b/third_party/libyuv/source/row_posix.cc @@ -1,3 +1,4 @@ +// VERSION 2 /* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * @@ -92,6 +93,7 @@ static uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +// 7 bit fixed point 0.5. 
static vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; @@ -221,7 +223,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "1: \n" "movq " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -229,47 +231,13 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { "+r"(dst_argb), // %1 "+r"(pix) // %2 : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm5" ); } #endif // TESTING #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -291,11 +259,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm5" ); } #endif // HAS_I400TOARGBROW_SSE2 @@ -318,27 +282,24 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -359,27 +320,24 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " 
MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -417,9 +375,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -427,13 +384,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(dst), // %1 "+r"(pix) // %2 : - : "memory", "cc", "eax" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -474,9 +426,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) - MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -484,13 +435,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(dst), // %1 "+r"(pix) // %2 : - : "memory", "cc", "eax" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -518,9 +464,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) - MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -528,13 +473,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(dst), // %1 "+r"(pix) // %2 : - : "memory", "cc", "eax" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -572,10 +512,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "+r"(dst), // %1 "+r"(pix) // %2 : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -613,10 +550,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "+r"(dst), // %1 "+r"(pix) // %2 : 
"m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -631,7 +565,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -652,11 +586,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -672,7 +602,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -690,17 +620,14 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "packssdw %%xmm0,%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMACCESS2(0x8,1) ",%1 \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + :: "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -712,7 +639,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -728,57 +655,17 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" ); } #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" @@ -796,63 +683,24 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : "m"(kARGBToY), // %3 "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -874,53 +722,131 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : "m"(kARGBToYJ), // %3 "m"(kAddYJ64) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYJROW_AVX2 + #ifdef HAS_ARGBTOUVROW_SSSE3 -// TODO(fbarchard): pass xmm constants to single block of assembly. -// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes -// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, -// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around -// and considered unsafe. void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -942,52 +868,114 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - 
: "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } +#endif // HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) + VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) + "lea " MEMLEA(0x80,0) ",%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToUJ), // %0 - "m"(kARGBToVJ), // %1 - "m"(kAddUVJ128) // %2 - ); - asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1010,245 +998,32 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw 
$0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToUJ), // %0 - "m"(kARGBToVJ), // %1 - "m"(kAddUVJ128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)) - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } +#endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 
\n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6" -#endif - ); -} - -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, - uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" @@ -1266,7 +1041,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" @@ -1283,98 +1057,30 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "lea " MEMLEA(0x40,0) ",%0 \n" - BUNDLEALIGN MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6" -#endif + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6" ); } +#endif // HAS_ARGBTOUV444ROW_SSSE3 +#ifdef HAS_ARGBTOUV422ROW_SSSE3 void ARGBToUV422Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb 
%%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kARGBToU), // %0 - "m"(kARGBToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" @@ -1403,65 +1109,25 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } +#endif // HAS_ARGBTOUV422ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1482,116 +1148,41 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : "m"(kBGRAToY), // %3 "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 
\n" - : - : "m"(kBGRAToU), // %0 - "m"(kBGRAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kBGRAToU), // %0 - "m"(kBGRAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1613,65 +1204,25 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // 
%2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1692,60 +1243,20 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : "m"(kABGRToY), // %3 "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -1766,116 +1277,41 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), 
// %1 "+r"(pix) // %2 : "m"(kRGBAToY), // %3 "m"(kAddY16) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kABGRToU), // %0 - "m"(kABGRToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kABGRToU), // %0 - "m"(kABGRToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" 
@@ -1897,121 +1333,46 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kRGBAToU), // %0 - "m"(kRGBAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 - MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 - MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 - MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif - ); -} - -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kRGBAToU), // %0 - "m"(kRGBAToV), // %1 - "m"(kAddUV128) // %2 - ); - asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "movdqu " 
MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -2033,75 +1394,83 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" -#endif + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" ); } -#endif // HAS_ARGBTOUVROW_SSSE3 -#ifdef HAS_I422TOARGBROW_SSSE3 -#define UB 127 /* min(63,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ +// Bias values to subtract 16 from Y and 128 from U and V. 
+#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) -struct { - vec8 kUVToB; // 0 - vec8 kUVToG; // 16 - vec8 kUVToR; // 32 - vec16 kUVBiasB; // 48 - vec16 kUVBiasG; // 64 - vec16 kUVBiasR; // 80 - vec16 kYSub16; // 96 - vec16 kYToRgb; // 112 - vec8 kVUToB; // 128 - vec8 kVUToG; // 144 - vec8 kVUToR; // 160 -} static SIMD_ALIGNED(kYuvConstants) = { - { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR }, - { 16, 16, 16, 16, 16, 16, 16, 16 }, - { YG, YG, YG, YG, YG, YG, YG, YG }, - { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } +struct YuvConstants { + lvec8 kUVToB; // 0 + lvec8 kUVToG; // 32 + lvec8 kUVToR; // 64 + lvec16 kUVBiasB; // 96 + lvec16 kUVBiasG; // 128 + lvec16 kUVBiasR; // 160 + lvec16 kYToRgb; // 192 }; +// BT601 constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvConstants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; + +// BT601 constants for NV21 where chroma plane is VU instead of UV. 
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = { + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; // Read 8 UV from 411 #define READYUV444 \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - BUNDLEALIGN \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" @@ -2109,7 +1478,6 @@ struct { // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - BUNDLEALIGN \ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -2118,7 +1486,6 @@ struct { // Read 2 UV from 411, upsample to 8 UV #define READYUV411 \ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - BUNDLEALIGN \ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -2132,20 +1499,23 @@ struct { "punpcklwd %%xmm0,%%xmm0 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB \ +#define YUVTORGB(YuvConstants) \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ - "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ - "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ - "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ - "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "punpcklbw %%xmm3,%%xmm3 \n" \ + "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ "paddsw %%xmm3,%%xmm0 \n" \ "paddsw %%xmm3,%%xmm1 \n" \ "paddsw %%xmm3,%%xmm2 \n" \ @@ -2156,30 +1526,51 @@ struct { "packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm2,%%xmm2 \n" -// Convert 8 pixels: 8 VU and 8 Y -#define YVUTORGB \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ - "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ - "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ - "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ - 
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ - "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +// Store 8 ARGB values. Assumes XMM5 is zero. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + +// Store 8 BGRA values. Assumes XMM5 is zero. +#define STOREBGRA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm0,%%xmm1 \n" \ + "punpcklbw %%xmm2,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + +// Store 8 ABGR values. Assumes XMM5 is zero. +#define STOREABGR \ + "punpcklbw %%xmm1,%%xmm2 \n" \ + "punpcklbw %%xmm5,%%xmm0 \n" \ + "movdqa %%xmm2,%%xmm1 \n" \ + "punpcklwd %%xmm0,%%xmm2 \n" \ + "punpckhwd %%xmm0,%%xmm1 \n" \ + "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + +// Store 8 RGBA values. Assumes XMM5 is zero. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2189,19 +1580,11 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV444 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB(kYuvConstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2210,41 +1593,25 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } +// TODO(fbarchard): Consider putting masks into constants. void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* dst_rgb24, int width) { -// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. 
-#if defined(__i386__) asm volatile ( "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" - :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), - [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)); -#endif - - asm volatile ( -#if !defined(__i386__) - "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" -#endif "sub %[u_buf],%[v_buf] \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm2,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -2256,25 +1623,23 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" - "sub $0x8,%[width] \n" + "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +// TODO(fbarchard): Make width a register for 32 bit. +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) -#if !defined(__i386__) - , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) -#endif - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" ); } @@ -2283,26 +1648,14 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* dst_raw, int width) { -// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. -#if defined(__i386__) asm volatile ( "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" - :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), - [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)); -#endif - - asm volatile ( -#if !defined(__i386__) - "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" - "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" -#endif "sub %[u_buf],%[v_buf] \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm2,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -2314,25 +1667,23 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, "movq %%xmm0," MEMACCESS([dst_raw]) " \n" "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" - "sub $0x8,%[width] \n" + "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_raw]"+r"(dst_raw), // %[dst_raw] +// TODO(fbarchard): Make width a register for 32 bit. 
+#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) -#if !defined(__i386__) - , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) -#endif - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" ); } @@ -2344,19 +1695,11 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB(kYuvConstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2365,13 +1708,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2383,19 +1721,11 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV411 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB(kYuvConstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2404,13 +1734,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2420,19 +1745,11 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READNV12 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB(kYuvConstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2440,11 +1757,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // 
%[dst_argb] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2454,216 +1768,20 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, int width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READNV12 - YVUTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" + YUVTORGB(kYuvConstants) + STOREARGB "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" + : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV444 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "sub 
%[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV411 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READNV12 - YVUTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" - "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [uv_buf]"+r"(uv_buf), // %[uv_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" - // Does not use r14. 
-#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2675,20 +1793,11 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" - "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" + YUVTORGB(kYuvConstants) + STOREBGRA "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2697,13 +1806,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2715,19 +1819,11 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" - "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" + YUVTORGB(kYuvConstants) + STOREABGR "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2736,13 +1832,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -2754,20 +1845,11 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, asm volatile ( "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" LABELALIGN "1: \n" READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" - "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" - "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + YUVTORGB(kYuvConstants) + STORERGBA "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2776,159 +1858,233 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, - int width) { - asm volatile ( - "sub 
%[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" - "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" - "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" - "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - LABELALIGN - "1: \n" - READYUV422 - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" - "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" - "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" - "sub $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] - [width]"+rm"(width) // %[width] - : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } #endif // HAS_I422TOARGBROW_SSSE3 -#ifdef HAS_YTOARGBROW_SSE2 -void YToARGBRow_SSE2(const uint8* y_buf, - uint8* dst_argb, - int width) { +// Read 8 UV from 422, upsample to 16 UV. 
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
 asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x00100010,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x004a004a,%%eax \n"
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
+
+ "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TOBGRAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ARGB + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels + + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_I422TOABGRROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). +void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_YTOARGBROW_SSE2
+void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
 "movd %%eax,%%xmm2 \n"
 "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
 LABELALIGN
 "1: \n"
 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
 "movq " MEMACCESS(0) ",%%xmm0 \n"
 "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
 "psubusw %%xmm3,%%xmm0 \n"
- "pmullw %%xmm2,%%xmm0 \n"
 "psrlw $6, %%xmm0 \n"
 "packuswb %%xmm0,%%xmm0 \n"
@@ -2939,8 +2095,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 "punpckhwd %%xmm1,%%xmm1 \n"
 "por %%xmm4,%%xmm0 \n"
 "por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
 "lea " MEMLEA(0x20,1) ",%1 \n"
 "sub $0x8,%2 \n"
@@ -2950,13 +2106,58 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 "+rm"(width) // %2
 :
 : "memory", "cc", "eax"
-#if defined(__SSE2__)
 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
 );
 }
 #endif // HAS_YTOARGBROW_SSE2
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values.
G = (y - 16) * 1.164 + "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} +#endif // HAS_YTOARGBROW_AVX2 + #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. static uvec8 kShuffleMirror = { @@ -2967,38 +2168,56 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "movdqa %3,%%xmm5 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : "m"(kShuffleMirror) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" ); } #endif // HAS_MIRRORROW_SSSE3 +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" + LABELALIGN + "1: \n" + MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_MIRRORROW_AVX2 + #ifdef HAS_MIRRORROW_SSE2 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( - "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 "movdqa %%xmm0,%%xmm1 \n" "psllw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" @@ -3006,21 +2225,16 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { "pshuflw $0x1b,%%xmm0,%%xmm0 \n" "pshufhw $0x1b,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1)",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" ); } #endif // HAS_MIRRORROW_SSE2 @@ -3035,108 +2249,119 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, intptr_t 
temp_width = (intptr_t)(width); asm volatile ( "movdqa %4,%%xmm1 \n" - "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" - "sub $8,%3 \n" "movlpd %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $8,%3 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(temp_width) // %3 : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" ); } #endif // HAS_MIRRORROW_UV_SSSE3 -#ifdef HAS_ARGBMIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static uvec8 kARGBShuffleMirror = { - 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u -}; +#ifdef HAS_ARGBMIRRORROW_SSE2 -void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" - "movdqa %3,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror) // %3 + : : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif + , "xmm0" ); } -#endif // HAS_ARGBMIRRORROW_SSSE3 +#endif // HAS_ARGBMIRRORROW_SSE2 -#ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. 
+static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" + "vmovdqu %3,%%ymm5 \n" + LABELALIGN + "1: \n" + VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%3 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } +#endif // HAS_SPLITUVROW_AVX2 -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3164,52 +2389,46 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } #endif // HAS_SPLITUVROW_SSE2 -#ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm0," 
MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" + "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" + "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x20,%3 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" ); } +#endif // HAS_MERGEUVROW_AVX2 -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width) { +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { asm volatile ( "sub %0,%1 \n" LABELALIGN @@ -3230,13 +2449,8 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, "+r"(dst_uv), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" ); } #endif // HAS_MERGEUVROW_SSE2 @@ -3246,11 +2460,11 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" "jg 1b \n" @@ -3259,30 +2473,36 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { "+r"(count) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1" -#endif ); } #endif // HAS_COPYROW_SSE2 -#ifdef HAS_COPYROW_X86 -void CopyRow_X86(const uint8* src, uint8* dst, int width) { - size_t width_tmp = (size_t)(width); +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { asm volatile ( - "shr $0x2,%2 \n" - "rep movsl " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 : : "memory", "cc" + , "xmm0", "xmm1" ); } -#endif // HAS_COPYROW_X86 +#endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS -// Unaligned Multiple of 1. +// Multiple of 1. 
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = (size_t)(width); asm volatile ( @@ -3306,19 +2526,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "psrld $0x8,%%xmm1 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3327,9 +2547,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "+r"(width) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 @@ -3358,9 +2576,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { "+r"(width) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" -#endif ); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 @@ -3380,16 +2596,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "punpcklbw %%xmm2,%%xmm2 \n" "punpckhwd %%xmm2,%%xmm3 \n" "punpcklwd %%xmm2,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm4 \n" - "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" "pand %%xmm0,%%xmm2 \n" "pand %%xmm0,%%xmm3 \n" "pand %%xmm1,%%xmm4 \n" "pand %%xmm1,%%xmm5 \n" "por %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -3398,9 +2614,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { "+r"(width) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 @@ -3431,18 +2645,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { "+r"(width) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" -#endif ); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint32 v32, int width) { - size_t width_tmp = (size_t)(width); +void SetRow_X86(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. 
asm volatile ( - "shr $0x2,%1 \n" "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 @@ -3450,19 +2662,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) { : "memory", "cc"); } -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - size_t width_tmp = (size_t)(width); - uint32* d = (uint32*)(dst); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(d), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); - dst += dst_stride; - } +void SetRow_ERMS(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosb " MEMSTORESTRING(al,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 @@ -3473,13 +2690,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" @@ -3488,9 +2705,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { "+r"(pix) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" -#endif ); } @@ -3502,11 +2717,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "sub %1,%2 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" @@ -3519,7 +2733,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" @@ -3529,13 +2742,8 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "+r"(dst_v), // %2 "+r"(pix) // %3 : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -3546,120 +2754,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - 
"psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, - int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" @@ -3673,7 +2767,6 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" @@ -3683,13 +2776,8 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" ); } @@ -3697,24 +2785,22 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" 
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1" -#endif ); } @@ -3725,121 +2811,9 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(pix) // %3 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - asm volatile ( - LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" @@ -3854,7 +2828,6 @@ void 
UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" @@ -3864,18 +2837,13 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(dst_v), // %2 "+r"(pix) // %3 : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -3894,7 +2862,6 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movq %%xmm0," MEMACCESS(1) " \n" - BUNDLEALIGN MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" @@ -3904,17 +2871,218 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" ); } #endif // HAS_YUY2TOYROW_SSE2 +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + 
uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," 
MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_YUY2TOYROW_AVX2 + #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time. void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, @@ -3956,9 +3124,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 10b \n" "19: \n" @@ -3988,9 +3156,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jge 41b \n" "49: \n" @@ -4019,9 +3187,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -4030,9 +3198,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "+r"(width) // %3 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif ); } #endif // HAS_ARGBBLENDROW_SSE2 @@ -4091,49 +3257,18 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 10b \n" "19: \n" "add $1-4,%3 \n" "jl 49f \n" - "test $0xf,%0 \n" - "jne 41f \n" - "test $0xf,%1 \n" - "jne 41f \n" // 4 pixel loop. LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqa " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "jge 40b \n" - "jmp 49f \n" - - // 4 pixel unaligned loop. 
- LABELALIGN - "41: \n" "movdqu " MEMACCESS(0) ",%%xmm3 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" @@ -4152,10 +3287,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" - "jge 41b \n" + "sub $0x4,%3 \n" + "jge 40b \n" "49: \n" "add $0x3,%3 \n" @@ -4181,9 +3316,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -4192,16 +3327,13 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "+r"(width) // %3 : "m"(kShuffleAlpha) // %4 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif ); } #endif // HAS_ARGBBLENDROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. -// aligned to 16 bytes void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" @@ -4212,17 +3344,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" @@ -4230,18 +3362,16 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "packuswb %%xmm1,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_ARGBATTENUATEROW_SSE2 @@ -4249,14 +3379,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha static uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u }; static uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u }; // Attenuate 4 pixels at a time. 
-// aligned to 16 bytes void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm3,%%xmm3 \n" @@ -4284,9 +3413,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4294,16 +3423,56 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { : "m"(kShuffleAlpha0), // %3 "m"(kShuffleAlpha1) // %4 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_ARGBATTENUATEROW_SSSE3 +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u +}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -// aligned to 16 bytes void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { uintptr_t alpha = 0; @@ -4324,7 +3493,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "movdqu " MEMACCESS(0) ",%%xmm1 \n" "movzb " MEMACCESS2(0x0b,0) ",%3 \n" "punpckhbw %%xmm1,%%xmm1 \n" - BUNDLEALIGN MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 "movzb " MEMACCESS2(0x0f,0) ",%3 \n" MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 @@ -4334,26 +3502,90 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pmulhuw %%xmm2,%%xmm1 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 "+r"(alpha) // %3 : "r"(fixed_invtbl8) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. 
+static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u +}; +// Unattenuate 8 pixels at a time. +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + // replace VPGATHER + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "movzb " MEMACCESS2(0x13,0) ",%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x17,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x1b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x1f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { @@ -4364,16 +3596,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" @@ -4385,10 +3617,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4396,9 +3628,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { : "m"(kARGBToYJ), // %3 "m"(kAddYJ64) // %4 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -4430,30 +3660,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm5 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4462,10 +3692,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" - "sub $0x8,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4473,9 +3703,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "m"(kARGBToSepiaG), // %3 "m"(kARGBToSepiaR) // %4 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif ); } #endif // HAS_ARGBSEPIAROW_SSSE3 @@ -4495,12 +3723,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm7,%%xmm0 \n" @@ -4510,13 +3738,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm7 \n" "phaddsw %%xmm7,%%xmm1 \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm7 \n" "phaddsw %%xmm7,%%xmm6 \n" @@ -4528,27 +3756,24 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "movdqa %%xmm0,%%xmm6 \n" "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif ); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -// aligned to 16 bytes void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile ( @@ -4568,23 +3793,23 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS(0) ",%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" "paddw %%xmm4,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" - "sub $0x4,%1 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x4,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4592,16 +3817,13 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "r"(interval_size), // %3 "r"(interval_offset) // %4 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif ); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -// Aligned to 16 bytes. void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( @@ -4612,7 +3834,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, // 4 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4622,18 +3844,16 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" -#endif ); } #endif // HAS_ARGBSHADEROW_SSE2 @@ -4643,7 +3863,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN @@ -4661,9 +3881,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4671,12 +3891,49 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "+r"(width) // %3 : : "memory", "cc" -#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__AVX2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } -#endif // HAS_ARGBMULTIPLYROW_SSE2 +#endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. @@ -4691,9 +3948,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4701,13 +3958,39 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "+r"(width) // %3 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1" -#endif ); } #endif // HAS_ARGBADDROW_SSE2 +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. 
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "vmovdqu %%ymm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" + , "xmm0" + ); +} +#endif // HAS_ARGBADDROW_AVX2 + #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, @@ -4721,9 +4004,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "psubusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4731,13 +4014,39 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "+r"(width) // %3 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1" -#endif ); } #endif // HAS_ARGBSUBTRACTROW_SSE2 +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "vmovdqu %%ymm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" + , "xmm0" + ); +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + #ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 @@ -4759,13 +4068,11 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" - BUNDLEALIGN MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - BUNDLEALIGN MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" @@ -4778,10 +4085,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%4 \n" - BUNDLEALIGN MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4789,13 +4095,8 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, "+r"(dst_sobelx), // %3 "+r"(width) // %4 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } #endif // HAS_SOBELXROW_SSE2 @@ -4820,13 +4121,11 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw 
%%xmm1,%%xmm0 \n" - BUNDLEALIGN "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - BUNDLEALIGN "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" @@ -4839,23 +4138,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%3 \n" - BUNDLEALIGN MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x8,%3 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } #endif // HAS_SOBELYROW_SSE2 @@ -4876,8 +4169,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -4893,25 +4186,20 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpckhwd %%xmm0,%%xmm0 \n" "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm1," MEMACCESS(2) " \n" - "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm1," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } #endif // HAS_SOBELROW_SSE2 @@ -4928,26 +4216,21 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" ); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -4967,8 +4250,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" @@ -4984,25 +4267,20 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "movdqa %%xmm1,%%xmm7 \n" "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "sub $0x10,%3 \n" - "movdqa %%xmm6," MEMACCESS(2) " \n" - "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "movdqu %%xmm6," MEMACCESS(2) " \n" + "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_SOBELXYROW_SSE2 @@ -5035,22 +4313,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa " MEMACCESS(2) ",%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" + "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" "lea " MEMLEA(0x40,2) ",%2 \n" "paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2," MEMACCESS(1) " \n" - "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -5082,9 +4360,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "+r"(width) // %3 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif ); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 @@ -5115,11 +4391,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel small loop \n" LABELALIGN "4: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - BUNDLEALIGN + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 @@ -5129,7 +4404,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" "psubd " MEMACCESS2(0x20,1) 
",%%xmm2 \n" "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - BUNDLEALIGN MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 @@ -5149,11 +4423,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 4 pixel loop \n" LABELALIGN "40: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" - BUNDLEALIGN + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 @@ -5163,7 +4436,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - BUNDLEALIGN MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 @@ -5196,11 +4468,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" LABELALIGN "10: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 "lea " MEMLEA(0x10,0) ",%0 \n" "psubd " MEMACCESS(1) ",%%xmm0 \n" - BUNDLEALIGN MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 "lea " MEMLEA(0x10,1) ",%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" @@ -5219,13 +4490,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "+rm"(count) // %3 : "r"((intptr_t)(width)), // %4 "rm"(area) // %5 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 @@ -5268,7 +4534,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "pshufd $0x39,%%xmm0,%%xmm0 \n" "movd %%xmm0,%k5 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" - BUNDLEALIGN MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm1 \n" @@ -5277,14 +4542,13 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" "movd %%xmm0,%k5 \n" - BUNDLEALIGN MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm0 \n" "addps %%xmm4,%%xmm3 \n" - "sub $0x4,%4 \n" "movq %%xmm0," MEMACCESS2(0x08,2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%4 \n" "jge 40b \n" "49: \n" @@ -5299,11 +4563,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "pmaddwd %%xmm5,%%xmm0 \n" "addps %%xmm7,%%xmm2 \n" "movd %%xmm0,%k1 \n" - BUNDLEALIGN MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "sub $0x1,%4 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x04,2) ",%2 \n" + "sub $0x1,%4 \n" "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 @@ -5313,13 +4576,8 @@ void 
ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "+rm"(width), // %4 "+r"(temp) // %5 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_ARGBAFFINEROW_SSE2 @@ -5352,8 +4610,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // General purpose row blend. LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm2) + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" @@ -5362,61 +4620,57 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. LABELALIGN "25: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. LABELALIGN "75: \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm0) + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 100b \n" "99: \n" @@ -5425,17 +4679,113 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" ); } #endif // HAS_INTERPOLATEROW_SSSE3 +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpermd %%ymm5,%%ymm0,%%ymm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb " MEMMOVESTRING(1,0) " \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+c"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_AVX2 + #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, @@ -5465,8 +4815,8 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, // General purpose row blend. 
LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -5482,242 +4832,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqa " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSE2 - -#ifdef HAS_INTERPOLATEROW_SSSE3 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqu %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. 
- LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} -#endif // HAS_INTERPOLATEROW_SSSE3 - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" @@ -5728,10 +4845,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 25b \n" "jmp 99f \n" @@ -5741,10 +4857,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 50b \n" "jmp 99f \n" @@ -5755,10 +4870,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - BUNDLEALIGN MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 75b \n" "jmp 99f \n" @@ -5766,9 +4880,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, LABELALIGN "100: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 100b \n" "99: \n" @@ -5777,73 +4891,12 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_INTERPOLATEROW_SSE2 -#ifdef HAS_HALFROW_SSE2 -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 - "sub $0x10,%2 \n" - MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) - "lea " MEMLEA(0x10,0) ",%0 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(pix) // %2 - : "r"((intptr_t)(src_uv_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0" -#endif - ); -} -#endif // HAS_HALFROW_SSE2 - -#ifdef HAS_ARGBTOBAYERROW_SSSE3 -void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - asm volatile ( - // NaCL caveat - assumes movd is from GPR - "movd %3,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa 
" MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "sub $0x8,%2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : "g"(selector) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} -#endif // HAS_ARGBTOBAYERROW_SSSE3 - #ifdef HAS_ARGBTOBAYERGGROW_SSE2 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -5852,8 +4905,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "psrld $0x18,%%xmm5 \n" LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x8,%%xmm0 \n" "psrld $0x8,%%xmm1 \n" @@ -5861,18 +4914,16 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "pand %%xmm5,%%xmm1 \n" "packssdw %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x8,%2 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" -#endif ); } #endif // HAS_ARGBTOBAYERGGROW_SSE2 @@ -5882,34 +4933,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - asm volatile ( - "movdqa " MEMACCESS(3) ",%%xmm5 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" @@ -5917,19 +4941,17 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" - "sub $0x8,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "r"(shuffler) // %3 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" -#endif ); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -5947,19 +4969,18 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x40,0) ",%0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "sub $0x10,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 : "r"(shuffler) // %3 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" -#endif ); } #endif // 
HAS_ARGBSHUFFLEROW_AVX2 @@ -5989,7 +5010,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "movzb " MEMACCESS2(0x1,4) ",%2 \n" MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 "mov %b2," MEMACCESS2(0x1,1) " \n" - BUNDLEALIGN "movzb " MEMACCESS2(0x2,4) ",%2 \n" MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 "mov %b2," MEMACCESS2(0x2,1) " \n" @@ -6014,9 +5034,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x1b,%%xmm1,%%xmm1 \n" "pshuflw $0x1b,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 123b \n" "jmp 99f \n" @@ -6032,9 +5052,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x39,%%xmm1,%%xmm1 \n" "pshuflw $0x39,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 321b \n" "jmp 99f \n" @@ -6050,9 +5070,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x93,%%xmm1,%%xmm1 \n" "pshuflw $0x93,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 2103b \n" "jmp 99f \n" @@ -6068,9 +5088,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0xc6,%%xmm1,%%xmm1 \n" "pshuflw $0xc6,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 3012b \n" "99: \n" @@ -6079,13 +5099,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "+d"(pixel_temp), // %2 "+r"(pix) // %3 : "r"(shuffler) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" ); } #endif // HAS_ARGBSHUFFLEROW_SSE2 @@ -6119,13 +5134,8 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, "+r"(dst_frame), // %3 "+rm"(width) // %4 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" ); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -6159,13 +5169,8 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, "+r"(dst_frame), // %3 "+rm"(width) // %4 : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" ); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -6212,18 +5217,16 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, "cvttps2dq %%xmm4,%%xmm4 \n" "packuswb %%xmm4,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x2,%2 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x2,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(poly) // %3 : "memory", "cc" -#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif ); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -6253,20 +5256,17 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "sub $0x2,%2 \n" "vmovq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x2,%2 \n" "jg 1b \n" 
"vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(poly) // %3 - : "memory", "cc" -#if defined(__SSE2__) -// TODO(fbarchard): declare ymm usage when applicable. - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 @@ -6376,7 +5376,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "movzb " MEMACCESS2(0x4,2) ",%0 \n" MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 "mov %b0," MEMACCESS2(0x4,3) " \n" - BUNDLEALIGN "movzb " MEMACCESS2(0x5,2) ",%0 \n" MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 "mov %b0," MEMACCESS2(0x5,3) " \n" @@ -6416,9 +5415,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "mov %b0," MEMACCESS2(0xe,3) " \n" "movzb " MEMACCESS2(0xf,2) ",%0 \n" "mov %b0," MEMACCESS2(0xf,3) " \n" - "sub $0x4,%4 \n" "lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,3) ",%3 \n" + "sub $0x4,%4 \n" "jg 1b \n" : "+d"(pixel_temp), // %0 "+a"(table_temp), // %1 @@ -6427,10 +5426,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "+rm"(width) // %4 : "r"(luma), // %5 "rm"(lumacoeff) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm3", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index d79c35396..5c06b6078 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -24,55 +24,63 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ (defined(_M_IX86) || defined(_M_X64)) -#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ -#define UB 127 /* min(127,(int8)(2.018 * 64)) */ -#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ -#define UR 0 +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ -#define VB 0 -#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ -#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) -// Bias -#define BB UB * 128 + VB * 128 -#define BG UG * 128 + VG * 128 -#define BR UR * 128 + VR * 128 - -static const vec8 kUVToB = { - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB +struct YuvConstants { + lvec8 kUVToB; // 0 + lvec8 kUVToG; // 32 + lvec8 kUVToR; // 64 + lvec16 kUVBiasB; // 96 + lvec16 kUVBiasG; // 128 + lvec16 kUVBiasR; // 160 + lvec16 kYToRgb; // 192 }; -static const vec8 kUVToR = { - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR +// BT601 constants for YUV to RGB. 
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } }; -static const vec8 kUVToG = { - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG +// BT601 constants for NV21 where chroma plane is VU instead of UV. +static YuvConstants SIMD_ALIGNED(kYvuConstants) = { + { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } }; -static const vec8 kVUToB = { - VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, -}; - -static const vec8 kVUToR = { - VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, -}; - -static const vec8 kVUToG = { - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, -}; - -static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; -static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; -static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; -static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; -static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; - // 64 bit #if defined(_M_X64) -// Aligned destination version. 
__declspec(align(16)) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -81,7 +89,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, int width) { __m128i xmm0, xmm1, xmm2, xmm3; const __m128i xmm5 = _mm_set1_epi8(-1); - const __m128i xmm4 = _mm_setzero_si128(); const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; while (width > 0) { @@ -89,18 +96,17 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); + xmm1 = _mm_loadu_si128(&xmm0); + xmm2 = _mm_loadu_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); + xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); + xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); + xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); xmm3 = _mm_loadl_epi64((__m128i*)y_buf); - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); + xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); xmm0 = _mm_adds_epi16(xmm0, xmm3); xmm1 = _mm_adds_epi16(xmm1, xmm3); xmm2 = _mm_adds_epi16(xmm2, xmm3); @@ -112,60 +118,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, xmm2 = _mm_packus_epi16(xmm2, xmm2); xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); - - _mm_store_si128((__m128i *)dst_argb, xmm0); - _mm_store_si128((__m128i *)(dst_argb + 16), xmm1); - - y_buf += 8; - u_buf += 4; - dst_argb += 32; - width -= 8; - } -} - -// Unaligned destination version. 
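// A minimal scalar sketch of the fixed-point math the new YG/YGB/UB/UG/VG/VR
// and BB/BG/BR constants above encode, mirroring the instruction sequence in
// the rewritten I422ToARGBRow_SSSE3 (replicate Y into both bytes of a 16-bit
// lane, scale by YG with an unsigned high multiply, subtract the U/V terms
// from the precomputed biases, shift down 6 bits and clamp). The names
// Clamp0To255 and YuvPixelSketch are illustrative only, not part of the patch.
#include <stdint.h>

static uint8_t Clamp0To255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  // mulhi_epu16(y * 0x0101, YG) ~= 1.164 * 64 * y, i.e. Y in 6-bit fixed
  // point with the 1.164 studio-swing gain folded in.
  int y1 = (int)(((uint32_t)y * 0x0101u * YG) >> 16);
  *b = Clamp0To255((BB - (u * UB) + y1) >> 6);           // ~1.164*(Y-16) + 2.0*(U-128), UB clamped from 2.018*64
  *g = Clamp0To255((BG - (u * UG + v * VG) + y1) >> 6);  // ~1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128)
  *r = Clamp0To255((BR - (v * VR) + y1) >> 6);           // ~1.164*(Y-16) + 1.596*(V-128)
}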
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __m128i xmm0, xmm1, xmm2, xmm3; - const __m128i xmm5 = _mm_set1_epi8(-1); - const __m128i xmm4 = _mm_setzero_si128(); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; - - while (width > 0) { - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); - xmm1 = _mm_load_si128(&xmm0); - xmm2 = _mm_load_si128(&xmm0); - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB); - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG); - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR); - xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB); - xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG); - xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR); - xmm3 = _mm_loadl_epi64((__m128i*)y_buf); - xmm3 = _mm_unpacklo_epi8(xmm3, xmm4); - xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16); - xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb); - xmm0 = _mm_adds_epi16(xmm0, xmm3); - xmm1 = _mm_adds_epi16(xmm1, xmm3); - xmm2 = _mm_adds_epi16(xmm2, xmm3); - xmm0 = _mm_srai_epi16(xmm0, 6); - xmm1 = _mm_srai_epi16(xmm1, 6); - xmm2 = _mm_srai_epi16(xmm2, 6); - xmm0 = _mm_packus_epi16(xmm0, xmm0); - xmm1 = _mm_packus_epi16(xmm1, xmm1); - xmm2 = _mm_packus_epi16(xmm2, xmm2); - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); - xmm1 = _mm_load_si128(&xmm0); + xmm1 = _mm_loadu_si128(&xmm0); xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); @@ -178,6 +131,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, width -= 8; } } + // 32 bit #else // defined(_M_X64) @@ -209,15 +163,10 @@ static const vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; - // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 }; // Constants for BGRA. @@ -263,6 +212,7 @@ static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +// 7 bit fixed point 0.5. 
static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; @@ -316,36 +266,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm5 - por xmm1, xmm5 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, - int pix) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - - align 4 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] @@ -374,7 +294,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { pslld xmm5, 24 movdqa xmm4, kShuffleMaskRGB24ToARGB - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -386,18 +305,18 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 - sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] + sub ecx, 16 jg convertloop ret } @@ -414,7 +333,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, pslld xmm5, 24 movdqa xmm4, kShuffleMaskRAWToARGB - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -426,18 +344,18 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 - movdqa [edx + 32], xmm2 + movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 - movdqa [edx + 16], xmm1 + movdqu [edx + 16], xmm1 por xmm3, xmm5 - sub ecx, 16 - movdqa [edx + 48], xmm3 + movdqu [edx + 48], xmm3 lea edx, [edx + 64] + sub ecx, 16 jg convertloop ret } @@ -474,7 +392,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, sub edx, eax sub edx, eax - align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 @@ -491,8 +408,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -524,7 +441,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, sub edx, eax sub edx, eax - align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 @@ -545,8 +461,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 - movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 
16], xmm2 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -570,7 +486,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, sub edx, eax sub edx, eax - align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 @@ -585,8 +500,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB - movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop @@ -602,7 +517,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRGB24 - align 4 convertloop: movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] @@ -641,7 +555,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRAW - align 4 convertloop: movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] @@ -686,9 +599,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 - align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G pslld xmm0, 8 // R @@ -726,9 +638,8 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 - align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B movdqa xmm2, xmm0 // G movdqa xmm3, xmm0 // R @@ -764,14 +675,13 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 - align 4 convertloop: - movdqa xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 pand xmm0, xmm3 // low nibble pand xmm1, xmm4 // high nibble - psrl xmm0, 4 - psrl xmm1, 8 + psrld xmm0, 4 + psrld xmm1, 8 por xmm0, xmm1 packuswb xmm0, xmm0 lea eax, [eax + 16] @@ -783,6 +693,116 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 + vpslld ymm5, ymm5, 11 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpslld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpsrad ymm0, ymm0, 16 // R + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] 
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + // Convert 16 ARGB pixels (64 bytes) to 16 Y values. __declspec(naked) __declspec(align(16)) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { @@ -790,15 +810,14 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY + movdqa xmm5, kAddY16 - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -810,15 +829,16 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } } -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
__declspec(naked) __declspec(align(16)) void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { @@ -828,12 +848,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 @@ -846,15 +865,20 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } } #ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + // Convert 32 ARGB pixels (128 bytes) to 32 Y values. __declspec(naked) __declspec(align(32)) void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { @@ -864,9 +888,8 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov ecx, [esp + 12] /* pix */ vbroadcastf128 ymm4, kARGBToY vbroadcastf128 ymm5, kAddY16 - vmovdqa ymm6, kPermdARGBToY_AVX + vmovdqu ymm6, kPermdARGBToY_AVX - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -883,10 +906,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpsrlw ymm2, ymm2, 7 vpackuswb ymm0, ymm0, ymm2 // mutates. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - vpaddb ymm0, ymm0, ymm5 - sub ecx, 32 + vpaddb ymm0, ymm0, ymm5 // add 16 for Y vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -904,9 +927,8 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov ecx, [esp + 12] /* pix */ vbroadcastf128 ymm4, kARGBToYJ vbroadcastf128 ymm5, kAddYJ64 - vmovdqa ymm6, kPermdARGBToY_AVX + vmovdqu ymm6, kPermdARGBToY_AVX - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -925,9 +947,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpsrlw ymm2, ymm2, 7 vpackuswb ymm0, ymm0, ymm2 // mutates. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper @@ -936,119 +958,15 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) __declspec(align(16)) -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 - movdqa xmm4, kARGBToY - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToYJ - movdqa xmm5, kAddYJ64 - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - paddw xmm0, xmm5 - paddw xmm2, xmm5 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - __declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 - movdqa xmm4, kBGRAToY - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1065,9 +983,9 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -1079,44 +997,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 movdqa xmm4, kABGRToY - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 
- sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 - movdqa xmm4, kABGRToY - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1133,9 +1016,9 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -1147,44 +1030,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm5, kAddY16 movdqa xmm4, kRGBAToY - - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm4 - pmaddubsw xmm1, xmm4 - pmaddubsw xmm2, xmm4 - pmaddubsw xmm3, xmm4 - lea eax, [eax + 64] - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psrlw xmm0, 7 - psrlw xmm2, 7 - packuswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ movdqa xmm5, kAddY16 - movdqa xmm4, kRGBAToY - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1201,9 +1049,9 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -1220,22 +1068,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU sub edi, edx // stride from u to v - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1263,10 +1115,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1286,22 +1138,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToUJ - movdqa xmm6, kARGBToVJ movdqa xmm5, kAddUVJ128 + movdqa xmm6, kARGBToVJ + movdqa xmm7, kARGBToUJ sub 
edi, edx // stride from u to v - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1330,10 +1186,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, packsswb xmm0, xmm1 // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1359,7 +1215,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vbroadcastf128 ymm7, kARGBToU sub edi, edx // stride from u to v - align 4 convertloop: /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] @@ -1395,10 +1250,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values - sub ecx, 32 vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] + sub ecx, 32 jg convertloop pop edi @@ -1409,147 +1264,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_AVX2 -__declspec(naked) __declspec(align(16)) -void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push 
edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kARGBToUJ - movdqa xmm6, kARGBToVJ - movdqa xmm5, kAddUVJ128 - sub edi, edx // stride from u to v - - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] - movdqu xmm4, [eax + esi] - pavgb xmm0, xmm4 - movdqu xmm4, [eax + esi + 16] - pavgb xmm1, xmm4 - movdqu xmm4, [eax + esi + 32] - pavgb xmm2, xmm4 - movdqu xmm4, [eax + esi + 48] - pavgb xmm3, xmm4 - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - paddw xmm0, xmm5 // +.5 rounding -> unsigned - paddw xmm1, xmm5 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - __declspec(naked) __declspec(align(16)) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { @@ -1559,70 +1273,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU sub edi, edx // stride from u to v - align 4 - convertloop: - /* convert to U and V */ - movdqa xmm0, [eax] // U - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 - - movdqa xmm0, [eax] // V - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm6 - pmaddubsw xmm1, xmm6 - pmaddubsw xmm2, xmm6 - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm1 - phaddw xmm2, xmm3 - psraw xmm0, 8 - psraw xmm2, 8 - packsswb xmm0, xmm2 - paddb xmm0, xmm5 - lea eax, [eax + 64] - movdqa [edx + edi], xmm0 - lea edx, [edx + 16] - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 convertloop: /* convert to U and V */ movdqu xmm0, [eax] // U @@ -1639,7 +1294,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, psraw xmm2, 8 packsswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 movdqu xmm0, [eax] // V @@ -1659,6 +1313,7 @@ void 
ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, lea eax, [eax + 64] movdqu [edx + edi], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop pop edi @@ -1675,71 +1330,11 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU sub edi, edx // stride from u to v - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - movdqa xmm7, kARGBToU - movdqa xmm6, kARGBToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1773,10 +1368,10 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1795,92 +1390,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kBGRAToU - movdqa xmm6, kBGRAToV movdqa xmm5, kAddUV128 + movdqa xmm6, kBGRAToV + movdqa xmm7, kBGRAToU sub edi, edx // stride from u to v - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - 
- // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kBGRAToU - movdqa xmm6, kBGRAToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] movdqu xmm4, [eax + esi + 32] pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -1908,10 +1437,10 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1931,92 +1460,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kABGRToU - movdqa xmm6, kABGRToV movdqa xmm5, kAddUV128 + movdqa xmm6, kABGRToV + movdqa xmm7, kABGRToU sub edi, edx // stride from u to v - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kABGRToU - movdqa xmm6, kABGRToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, 
[eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] movdqu xmm4, [eax + esi + 32] pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -2044,10 +1507,10 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -2067,92 +1530,26 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kRGBAToU - movdqa xmm6, kRGBAToV movdqa xmm5, kAddUV128 + movdqa xmm6, kRGBAToV + movdqa xmm7, kRGBAToU sub edi, edx // stride from u to v - align 4 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] - pavgb xmm0, [eax + esi] - pavgb xmm1, [eax + esi + 16] - pavgb xmm2, [eax + esi + 32] - pavgb xmm3, [eax + esi + 48] - lea eax, [eax + 64] - movdqa xmm4, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm4, xmm1, 0xdd - pavgb xmm0, xmm4 - movdqa xmm4, xmm2 - shufps xmm2, xmm3, 0x88 - shufps xmm4, xmm3, 0xdd - pavgb xmm2, xmm4 - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm6 // V - pmaddubsw xmm3, xmm6 - phaddw xmm0, xmm2 - phaddw xmm1, xmm3 - psraw xmm0, 8 - psraw xmm1, 8 - packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned - - // step 3 - store 8 U and 8 V values - sub ecx, 16 - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V - lea edx, [edx + 8] - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, kRGBAToU - movdqa xmm6, kRGBAToV - movdqa xmm5, kAddUV128 - sub edi, edx // stride from u to v - - align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + 32] - movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] movdqu xmm4, [eax + esi + 16] pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] movdqu xmm4, [eax + esi + 32] pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] movdqu xmm4, [eax + esi + 48] pavgb xmm3, xmm4 + lea eax, [eax + 64] movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 @@ -2180,10 +1577,10 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -2193,36 +1590,68 @@ void 
RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Read 8 UV from NV12, upsample to 16 UV. +#define READNV12_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_AVX2(YuvConstants) __asm { \ + /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ + __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ + __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ + __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ + __asm vpsubw ymm2, ymm3, ymm2 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ + __asm vpsubw ymm1, ymm3, ymm1 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ + __asm vpsubw ymm0, ymm3, ymm0 \ + /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vmovdqu xmm3, [eax] /* NOLINT */ \ + __asm lea eax, [eax + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm3 \ + __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ + __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ + __asm vpsraw ymm0, ymm0, 6 \ + __asm vpsraw ymm1, ymm1, 6 \ + __asm vpsraw ymm2, ymm2, 6 \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + } + +// Store 16 ARGB values. 
+#define STOREARGB_AVX2 __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vmovdqu [edx], ymm1 \ + __asm vmovdqu [edx + 32], ymm0 \ + __asm lea edx, [edx + 64] \ + } + #ifdef HAS_I422TOARGBROW_AVX2 - -static const lvec8 kUVToB_AVX = { - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, - UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB -}; -static const lvec8 kUVToR_AVX = { - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, - UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR -}; -static const lvec8 kUVToG_AVX = { - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG -}; -static const lvec16 kYToRgb_AVX = { - YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG -}; -static const lvec16 kYSub16_AVX = { - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 -}; -static const lvec16 kUVBiasB_AVX = { - BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB -}; -static const lvec16 kUVBiasG_AVX = { - BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG -}; -static const lvec16 kUVBiasR_AVX = { - BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR -}; - // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) __declspec(align(16)) @@ -2241,63 +1670,218 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, mov ecx, [esp + 8 + 20] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - vpxor ymm4, ymm4, ymm4 - align 4 convertloop: - vmovq xmm0, qword ptr [esi] // U - vmovq xmm1, qword ptr [esi + edi] // V - lea esi, [esi + 8] - vpunpcklbw ymm0, ymm0, ymm1 // UV - vpermq ymm0, ymm0, 0xd8 - vpunpcklwd ymm0, ymm0, ymm0 // UVUV - vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV - vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV - vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV - vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed - vpsubw ymm1, ymm1, kUVBiasG_AVX - vpsubw ymm0, ymm0, kUVBiasR_AVX + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 - // Step 2: Find Y contribution to 16 R,G,B values - vmovdqu xmm3, [eax] // NOLINT - lea eax, [eax + 16] - vpermq ymm3, ymm3, 0xd8 - vpunpcklbw ymm3, ymm3, ymm4 - vpsubsw ymm3, ymm3, kYSub16_AVX - vpmullw ymm3, ymm3, kYToRgb_AVX - vpaddsw ymm2, ymm2, ymm3 // B += Y - vpaddsw ymm1, ymm1, ymm3 // G += Y - vpaddsw ymm0, ymm0, ymm3 // R += Y - vpsraw ymm2, ymm2, 6 - vpsraw ymm1, ymm1, 6 - vpsraw ymm0, ymm0, 6 - vpackuswb ymm2, ymm2, ymm2 // B - vpackuswb ymm1, ymm1, ymm1 // G - vpackuswb ymm0, ymm0, ymm0 // R - - // Step 3: Weave into ARGB - vpunpcklbw ymm2, ymm2, ymm1 // BG - vpermq ymm2, ymm2, 0xd8 - vpunpcklbw ymm0, ymm0, ymm5 // RA - vpermq ymm0, ymm0, 0xd8 - vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels - vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels - vmovdqu [edx], ymm1 - vmovdqu [edx + 32], ymm2 - lea edx, [edx + 64] sub ecx, 16 jg convertloop - vzeroupper pop edi pop esi + vzeroupper ret } } #endif // HAS_I422TOARGBROW_AVX2 -#ifdef HAS_I422TOARGBROW_SSSE3 +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} + +// 16 pixels. +// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYvuConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} + +#ifdef HAS_I422TOBGRAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) __declspec(align(16)) +void I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into BGRA + vpunpcklbw ymm1, ymm1, ymm0 // GB + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm2 // AR + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels + vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOBGRAROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into RGBA + vpunpcklbw ymm1, ymm1, ymm2 // GR + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm0 // AB + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels + vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#ifdef HAS_I422TOABGRROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + vpunpcklbw ymm1, ymm2, ymm1 // RG + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm0, ymm5 // BA + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels + vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 444. @@ -2337,22 +1921,25 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, } // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB __asm { \ +#define YUVTORGB(YuvConstants) __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ - __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ - __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ - __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ - __asm psubw xmm1, kUVBiasG \ - __asm psubw xmm2, kUVBiasR \ + __asm movdqa xmm3, xmm0 \ + __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ + __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ + __asm psubw xmm0, xmm1 \ + __asm movdqa xmm1, YuvConstants.kUVBiasG \ + __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ + __asm psubw xmm1, xmm2 \ + __asm movdqa xmm2, YuvConstants.kUVBiasR \ + __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ + __asm psubw xmm2, xmm3 \ /* Step 2: Find Y contribution to 8 R,G,B values */ \ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ __asm lea eax, [eax + 8] \ - __asm punpcklbw xmm3, xmm4 \ - __asm psubsw xmm3, kYSub16 \ - __asm pmullw xmm3, kYToRgb \ + __asm punpcklbw xmm3, xmm3 \ + __asm pmulhuw xmm3, YuvConstants.kYToRgb \ __asm paddsw xmm0, xmm3 /* B += Y */ \ __asm paddsw xmm1, xmm3 /* G += Y */ \ __asm paddsw xmm2, xmm3 /* R += Y */ \ @@ -2364,35 +1951,131 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, __asm packuswb xmm2, xmm2 /* R */ \ } -// Convert 8 pixels: 8 VU and 8 Y. -#define YVUTORGB __asm { \ - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ +// Store 8 ARGB values. 
+#define STOREARGB __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ - __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ - __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ - __asm psubw xmm1, kUVBiasG \ - __asm psubw xmm2, kUVBiasR \ - /* Step 2: Find Y contribution to 8 R,G,B values */ \ - __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ - __asm lea eax, [eax + 8] \ - __asm punpcklbw xmm3, xmm4 \ - __asm psubsw xmm3, kYSub16 \ - __asm pmullw xmm3, kYToRgb \ - __asm paddsw xmm0, xmm3 /* B += Y */ \ - __asm paddsw xmm1, xmm3 /* G += Y */ \ - __asm paddsw xmm2, xmm3 /* R += Y */ \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu [edx], xmm0 \ + __asm movdqu [edx + 16], xmm1 \ + __asm lea edx, [edx + 32] \ } -// 8 pixels, dest aligned 16. +// Store 8 BGRA values. +#define STOREBGRA __asm { \ + /* Step 3: Weave into BGRA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu [edx], xmm5 \ + __asm movdqu [edx + 16], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 ABGR values. +#define STOREABGR __asm { \ + /* Step 3: Weave into ABGR */ \ + __asm punpcklbw xmm2, xmm1 /* RG */ \ + __asm punpcklbw xmm0, xmm5 /* BA */ \ + __asm movdqa xmm1, xmm2 \ + __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ + __asm movdqu [edx], xmm2 \ + __asm movdqu [edx + 16], xmm1 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGBA values. +#define STORERGBA __asm { \ + /* Step 3: Weave into RGBA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu [edx], xmm5 \ + __asm movdqu [edx + 16], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGB24 values. +#define STORERGB24 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ + __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RAW values. 
+#define STORERAW __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RAW */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ + __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RGB565 values. +#define STORERGB565 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16] \ + } + +// 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16)) void I444ToARGBRow_SSSE3(const uint8* y_buf, @@ -2410,22 +2093,12 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - align 4 convertloop: READYUV444 - YUVTORGB + YUVTORGB(kYuvConstants) + STOREARGB - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2435,8 +2108,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
__declspec(naked) __declspec(align(16)) void I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2452,27 +2125,14 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, mov edx, [esp + 8 + 16] // rgb24 mov ecx, [esp + 8 + 20] // width sub edi, esi - pxor xmm4, xmm4 movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm6, kShuffleMaskARGBToRGB24 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STORERGB24 - // Step 3: Weave into RRGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm2 // RR - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRR first 4 pixels - punpckhwd xmm1, xmm2 // BGRR next 4 pixels - pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. - pshufb xmm1, xmm6 // Pack into first 12 bytes. - palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 - movq qword ptr [edx], xmm0 // First 8 bytes - movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. - lea edx, [edx + 24] sub ecx, 8 jg convertloop @@ -2482,8 +2142,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). __declspec(naked) __declspec(align(16)) void I422ToRAWRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2499,27 +2159,14 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, mov edx, [esp + 8 + 16] // raw mov ecx, [esp + 8 + 20] // width sub edi, esi - pxor xmm4, xmm4 movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm6, kShuffleMaskARGBToRAW - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STORERAW - // Step 3: Weave into RRGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm2 // RR - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRR first 4 pixels - punpckhwd xmm1, xmm2 // BGRR next 4 pixels - pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. - pshufb xmm1, xmm6 // Pack into first 12 bytes. - palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 - movq qword ptr [edx], xmm0 // First 8 bytes - movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. - lea edx, [edx + 24] sub ecx, 8 jg convertloop @@ -2529,8 +2176,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest unaligned. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). 
__declspec(naked) __declspec(align(16)) void I422ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2546,7 +2193,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, mov edx, [esp + 8 + 16] // rgb565 mov ecx, [esp + 8 + 20] // width sub edi, esi - pxor xmm4, xmm4 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 @@ -2555,45 +2201,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STORERGB565 - // Step 3: Weave into RRGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm2 // RR - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRR first 4 pixels - punpckhwd xmm1, xmm2 // BGRR next 4 pixels - - // Step 3b: RRGB -> RGB565 - movdqa xmm3, xmm0 // B first 4 pixels of argb - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm3, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm3, xmm5 // B - pand xmm2, xmm6 // G - pand xmm0, xmm7 // R - por xmm3, xmm2 // BG - por xmm0, xmm3 // BGR - movdqa xmm3, xmm1 // B next 4 pixels of argb - movdqa xmm2, xmm1 // G - pslld xmm1, 8 // R - psrld xmm3, 3 // B - psrld xmm2, 5 // G - psrad xmm1, 16 // R - pand xmm3, xmm5 // B - pand xmm2, xmm6 // G - pand xmm1, xmm7 // R - por xmm3, xmm2 // BG - por xmm1, xmm3 // BGR - packssdw xmm0, xmm1 sub ecx, 8 - movdqu [edx], xmm0 // store 8 pixels of RGB565 - lea edx, [edx + 16] jg convertloop pop edi @@ -2602,7 +2215,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. +// 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16)) void I422ToARGBRow_SSSE3(const uint8* y_buf, @@ -2620,22 +2233,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STOREARGB - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2645,7 +2248,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. +// 8 pixels. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // Similar to I420 but duplicate UV once more. __declspec(naked) __declspec(align(16)) @@ -2664,23 +2267,13 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, mov edx, [esp + 12 + 16] // argb mov ecx, [esp + 12 + 20] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - align 4 convertloop: READYUV411 // modifies EBX - YUVTORGB + YUVTORGB(kYuvConstants) + STOREARGB - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2691,7 +2284,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. +// 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
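The pcmpeqb/shift sequences in I422ToRGB565Row_SSSE3 build the RGB565 channel masks without loading constants from memory. A scalar sketch of the idea follows (illustrative only; the green-mask derivation shown is one plausible way to reach 0x000007e0, since that part of the setup falls outside this hunk).

#include <assert.h>
#include <stdint.h>

// Start from all ones (what pcmpeqb produces) and shift so that only the
// bits of each RGB565 channel remain set.
static void MakeRGB565Masks(void) {
  uint32_t ones = 0xffffffffu;
  uint32_t mask_b = ones >> 27;         // 0x0000001f, like psrld xmm5, 27
  uint32_t mask_g = (ones >> 26) << 5;  // 0x000007e0
  uint32_t mask_r = ones << 11;         // 0xfffff800, like pslld xmm7, 11
  assert(mask_b == 0x0000001fu);
  assert(mask_g == 0x000007e0u);
  assert(mask_r == 0xfffff800u);
}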
__declspec(naked) __declspec(align(16)) void NV12ToARGBRow_SSSE3(const uint8* y_buf, @@ -2705,22 +2298,12 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, mov edx, [esp + 4 + 12] // argb mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - align 4 convertloop: READNV12 - YUVTORGB + YUVTORGB(kYuvConstants) + STOREARGB - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2729,183 +2312,13 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// 8 pixels. +// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16)) void NV21ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // VU - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READNV12 - YVUTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) -void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV444 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) __declspec(align(16)) -void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -// 8 pixels, unaligned. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) __declspec(align(16)) -void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // argb - mov ecx, [esp + 12 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV411 // modifies EBX - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - pop ebx - ret - } -} - -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) -void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { __asm { push esi mov eax, [esp + 4 + 4] // Y @@ -2913,60 +2326,12 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, mov edx, [esp + 4 + 12] // argb mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - align 4 convertloop: READNV12 - YUVTORGB + YUVTORGB(kYvuConstants) + STOREARGB - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop esi - ret - } -} - -// 8 pixels, dest aligned 16. -// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
-__declspec(naked) __declspec(align(16)) -void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // Y - mov esi, [esp + 4 + 8] // VU - mov edx, [esp + 4 + 12] // argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READNV12 - YVUTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm0, xmm1 // BG - punpcklbw xmm2, xmm5 // RA - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm2 // BGRA first 4 pixels - punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqu [edx], xmm0 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -2990,64 +2355,12 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, mov edx, [esp + 8 + 16] // bgra mov ecx, [esp + 8 + 20] // width sub edi, esi - pxor xmm4, xmm4 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STOREBGRA - // Step 3: Weave into BGRA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm0 // GB - punpcklbw xmm5, xmm2 // AR - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // BGRA first 4 pixels - punpckhwd xmm0, xmm1 // BGRA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_bgra, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // bgra - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into BGRA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm0 // GB - punpcklbw xmm5, xmm2 // AR - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // BGRA first 4 pixels - punpckhwd xmm0, xmm1 // BGRA next 4 pixels - movdqu [edx], xmm5 - movdqu [edx + 16], xmm0 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3073,63 +2386,12 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STOREABGR - // Step 3: Weave into ARGB - punpcklbw xmm2, xmm1 // RG - punpcklbw xmm0, xmm5 // BA - movdqa xmm1, xmm2 - punpcklwd xmm2, xmm0 // RGBA first 4 pixels - punpckhwd xmm1, xmm0 // RGBA next 4 pixels - movdqa [edx], xmm2 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_abgr, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // abgr - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm2, xmm1 // RG - punpcklbw xmm0, xmm5 // BA - movdqa xmm1, xmm2 - punpcklwd xmm2, xmm0 // RGBA first 4 pixels - punpckhwd xmm1, xmm0 // RGBA next 4 pixels - movdqu [edx], xmm2 - movdqu [edx + 16], xmm1 - lea edx, [edx + 32] sub ecx, 8 jg 
convertloop @@ -3154,64 +2416,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, mov edx, [esp + 8 + 16] // rgba mov ecx, [esp + 8 + 20] // width sub edi, esi - pxor xmm4, xmm4 - align 4 convertloop: READYUV422 - YUVTORGB + YUVTORGB(kYuvConstants) + STORERGBA - // Step 3: Weave into RGBA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm2 // GR - punpcklbw xmm5, xmm0 // AB - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // RGBA first 4 pixels - punpckhwd xmm0, xmm1 // RGBA next 4 pixels - movdqa [edx], xmm5 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgba - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pxor xmm4, xmm4 - - align 4 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into RGBA - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - punpcklbw xmm1, xmm2 // GR - punpcklbw xmm5, xmm0 // AB - movdqa xmm0, xmm5 - punpcklwd xmm5, xmm1 // RGBA first 4 pixels - punpckhwd xmm0, xmm1 // RGBA next 4 pixels - movdqu [edx], xmm5 - movdqu [edx + 16], xmm0 - lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3224,32 +2434,32 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_YTOARGBROW_SSE2 +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). __declspec(naked) __declspec(align(16)) void YToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm { - pxor xmm5, xmm5 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - mov eax, 0x00100010 - movd xmm3, eax - pshufd xmm3, xmm3, 0 - mov eax, 0x004a004a // 74 + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + movd xmm3, eax + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width - align 4 convertloop: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm5 // 0.Y + punpcklbw xmm0, xmm0 // Y.Y + pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 - pmullw xmm0, xmm2 psrlw xmm0, 6 packuswb xmm0, xmm0 // G @@ -3260,23 +2470,74 @@ void YToARGBRow_SSE2(const uint8* y_buf, punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop - ret } } #endif // HAS_YTOARGBROW_SSE2 +#ifdef HAS_YTOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. 
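A scalar reading of the new YToARGBRow constants (a hedged sketch, not part of the patch; YToGray is a hypothetical name): duplicating the Y byte into a 16-bit word and taking the high half of a multiply by 0x4a35 approximates y * 1.164 * 64, subtracting 0x0488 removes the 16-level black offset at the same scale, and the final shift by 6 drops the fixed-point factor.

#include <stdint.h>

// Approximate scalar form of the Y expansion above: G ~= clamp((y - 16) * 1.164).
static uint8_t YToGray(uint8_t y) {
  uint32_t yy = ((uint32_t)y << 8) | y;         // like punpcklbw xmm0, xmm0
  int32_t g = (int32_t)((yy * 0x4a35u) >> 16);  // pmulhuw with 18997
  g -= 0x0488;                                  // psubusw; clamps at 0
  if (g < 0) g = 0;
  g >>= 6;                                      // psrlw xmm0, 6
  return (uint8_t)(g > 255 ? 255 : g);          // packuswb saturates
}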
+__declspec(naked) __declspec(align(16)) +void YToARGBRow_AVX2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + vmovd xmm2, eax + vbroadcastss ymm2, xmm2 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + vmovd xmm3, eax + vbroadcastss ymm3, xmm3 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpslld ymm4, ymm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpmulhuw ymm0, ymm0, ymm2 + vpsubusw ymm0, ymm0, ymm3 + vpsrlw ymm0, ymm0, 6 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + vpermq ymm1, ymm1, 0xd8 + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpor ymm0, ymm0, ymm4 + vpor ymm1, ymm1, ymm4 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_YTOARGBROW_AVX2 + #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. static const uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; +// TODO(fbarchard): Replace lea with -16 offset. __declspec(naked) __declspec(align(16)) void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { @@ -3284,15 +2545,13 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, kShuffleMirror - lea eax, [eax - 16] - align 4 convertloop: - movdqa xmm0, [eax + ecx] + movdqu xmm0, [eax - 16 + ecx] pshufb xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -3300,29 +2559,21 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -// Shuffle table for reversing the bytes. -static const ulvec8 kShuffleMirror_AVX2 = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; - __declspec(naked) __declspec(align(16)) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - vmovdqa ymm5, kShuffleMirror_AVX2 - lea eax, [eax - 32] + vbroadcastf128 ymm5, kShuffleMirror - align 4 convertloop: - vmovdqu ymm0, [eax + ecx] + vmovdqu ymm0, [eax - 32 + ecx] vpshufb ymm0, ymm0, ymm5 vpermq ymm0, ymm0, 0x4e // swap high and low halfs - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -3331,19 +2582,15 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORROW_SSE2 -// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 -// version can not. 
__declspec(naked) __declspec(align(16)) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - lea eax, [eax - 16] - align 4 convertloop: - movdqu xmm0, [eax + ecx] + movdqu xmm0, [eax - 16 + ecx] movdqa xmm1, xmm0 // swap bytes psllw xmm0, 8 psrlw xmm1, 8 @@ -3351,9 +2598,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { pshuflw xmm0, xmm0, 0x1b // swap words pshufhw xmm0, xmm0, 0x1b pshufd xmm0, xmm0, 0x4e // swap qwords - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -3379,15 +2626,14 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, lea eax, [eax + ecx * 2 - 16] sub edi, edx - align 4 convertloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm1 - sub ecx, 8 movlpd qword ptr [edx], xmm0 movhpd qword ptr [edx + edi], xmm0 lea edx, [edx + 8] + sub ecx, 8 jg convertloop pop edi @@ -3396,34 +2642,27 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, } #endif // HAS_MIRRORROW_UV_SSSE3 -#ifdef HAS_ARGBMIRRORROW_SSSE3 -// Shuffle table for reversing the bytes. -static const uvec8 kARGBShuffleMirror = { - 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u -}; - +#ifdef HAS_ARGBMIRRORROW_SSE2 __declspec(naked) __declspec(align(16)) -void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width lea eax, [eax - 16 + ecx * 4] // last 4 pixels. - movdqa xmm5, kARGBShuffleMirror - align 4 convertloop: - movdqa xmm0, [eax] + movdqu xmm0, [eax] lea eax, [eax - 16] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqa [edx], xmm0 + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret } } -#endif // HAS_ARGBMIRRORROW_SSSE3 +#endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
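In scalar terms, the ARGB mirroring above (now done with pshufd 0x1b over four pixels at a time) is simply a reversed copy of whole 4-byte pixels; a minimal sketch with a hypothetical name, for illustration only:

#include <stdint.h>
#include <string.h>

// Scalar sketch of ARGB row mirroring: dst pixel i is src pixel width-1-i,
// with each 4-byte ARGB pixel kept intact.
static void ARGBMirrorRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    memcpy(dst + 4 * i, src + 4 * (width - 1 - i), 4);
  }
}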
@@ -3437,15 +2676,13 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - lea eax, [eax - 32] - vmovdqa ymm5, kARGBShuffleMirror_AVX2 + vmovdqu ymm5, kARGBShuffleMirror_AVX2 - align 4 convertloop: - vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order - sub ecx, 8 + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 8 jg convertloop vzeroupper ret @@ -3466,44 +2703,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { psrlw xmm5, 8 sub edi, edx - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes - psrlw xmm3, 8 - packuswb xmm2, xmm3 - movdqa [edx], xmm0 - movdqa [edx + edi], xmm2 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -3526,6 +2725,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ret } } + #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 @@ -3541,7 +2741,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { vpsrlw ymm5, ymm5, 8 sub edi, edx - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3579,37 +2778,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 4 - convertloop: - movdqa xmm0, [eax] // read 16 U's - movdqa xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqa [edi], xmm0 - movdqa [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - align 4 convertloop: movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's @@ -3641,17 +2809,16 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 4 convertloop: vmovdqu ymm0, [eax] // read 32 U's vmovdqu ymm1, [eax + edx] // and 32 V's lea eax, [eax + 32] vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. 
mutated qqword 1,3 - vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 - vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 - vmovdqu [edi], ymm1 - vmovdqu [edi + 32], ymm2 + vextractf128 [edi], ymm2, 0 // bytes 0..15 + vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 + vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 + vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 lea edi, [edi + 64] sub ecx, 32 jg convertloop @@ -3672,13 +2839,12 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 32 jg convertloop @@ -3687,7 +2853,32 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 -// Unaligned Multiple of 1. +#ifdef HAS_COPYROW_AVX +// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. +__declspec(naked) __declspec(align(16)) +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 64 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_COPYROW_AVX + +// Multiple of 1. __declspec(naked) __declspec(align(16)) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { __asm { @@ -3703,24 +2894,6 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { } } -#ifdef HAS_COPYROW_X86 -__declspec(naked) __declspec(align(16)) -void CopyRow_X86(const uint8* src, uint8* dst, int count) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count - shr ecx, 2 - rep movsd - mov edi, edx - mov esi, eax - ret - } -} -#endif // HAS_COPYROW_X86 - #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels __declspec(naked) __declspec(align(16)) @@ -3734,21 +2907,20 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 - align 4 convertloop: - movdqa xmm2, [eax] - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] lea eax, [eax + 32] - movdqa xmm4, [edx] - movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3769,7 +2941,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { vpcmpeqb ymm0, ymm0, ymm0 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - align 4 convertloop: vmovdqu ymm1, [eax] vmovdqu ymm2, [eax + 32] @@ -3801,23 +2972,22 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 - align 4 convertloop: movq xmm2, qword ptr [eax] // 8 Y's lea eax, [eax + 8] punpcklbw xmm2, xmm2 punpckhwd xmm3, xmm2 punpcklwd xmm2, xmm2 - movdqa xmm4, [edx] - movdqa xmm5, [edx + 16] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] pand xmm2, xmm0 pand xmm3, xmm0 pand xmm4, xmm1 pand xmm5, xmm1 por xmm2, 
xmm4 por xmm3, xmm5 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -3838,7 +3008,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { vpcmpeqb ymm0, ymm0, ymm0 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff - align 4 convertloop: vpmovzxbd ymm1, qword ptr [eax] vpmovzxbd ymm2, qword ptr [eax + 8] @@ -3860,13 +3029,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// SetRow8 writes 'count' bytes using a 32 bit value repeated. +// Write 'count' bytes using an 8 bit value repeated. +// Count should be multiple of 4. __declspec(naked) __declspec(align(16)) -void SetRow_X86(uint8* dst, uint32 v32, int count) { +void SetRow_X86(uint8* dst, uint8 v8, int count) { __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. + mul edx // overwrites edx with upper part of result. mov edx, edi mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 mov ecx, [esp + 12] // count shr ecx, 2 rep stosd @@ -3875,33 +3047,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) { } } -// SetRow32 writes 'count' words using a 32 bit value repeated. +// Write 'count' bytes using an 8 bit value repeated. __declspec(naked) __declspec(align(16)) -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { +void SetRow_ERMS(uint8* dst, uint8 v8, int count) { __asm { - push esi - push edi - push ebp - mov edi, [esp + 12 + 4] // dst - mov eax, [esp + 12 + 8] // v32 - mov ebp, [esp + 12 + 12] // width - mov edx, [esp + 12 + 16] // dst_stride - mov esi, [esp + 12 + 20] // height - lea ecx, [ebp * 4] - sub edx, ecx // stride - width * 4 + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // count + rep stosb + mov edi, edx + ret + } +} - align 4 - convertloop: - mov ecx, ebp +// Write 'count' 32 bit values. +__declspec(naked) __declspec(align(16)) +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count rep stosd - add edi, edx - sub esi, 1 - jg convertloop - - pop ebp - pop edi - pop esi + mov edi, edx ret } } @@ -3918,7 +3087,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3927,9 +3095,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -3951,7 +3119,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3994,7 +3161,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -4029,7 +3195,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -4038,12 +3203,12 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 // mutates. 
vpermq ymm0, ymm0, 0xd8 - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop - ret vzeroupper + ret } } @@ -4062,7 +3227,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -4105,7 +3269,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -4144,17 +3307,16 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -4175,12 +3337,11 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm5, 8 sub edi, edx - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 @@ -4217,114 +3378,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, psrlw xmm5, 8 sub edi, edx - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, 
[esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4356,17 +3409,16 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -4387,12 +3439,11 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, psrlw xmm5, 8 sub edi, edx - align 4 convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 @@ -4429,112 +3480,6 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, psrlw xmm5, 8 sub edi, edx - align 4 - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4607,9 
+3552,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge alignloop1 alignloop1b: @@ -4638,9 +3583,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jge convertloop4 convertloop4b: @@ -4669,9 +3614,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge convertloop1 convertloop1b: @@ -4739,48 +3684,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge alignloop1 alignloop1b: add ecx, 1 - 4 jl convertloop4b - test eax, 15 // unaligned? - jne convertuloop4 - test esi, 15 // unaligned? - jne convertuloop4 - // 4 pixel loop. convertloop4: - movdqa xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi] // _r_b - pshufb xmm3, kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jge convertloop4 - jmp convertloop4b - - // 4 pixel unaligned loop. - convertuloop4: movdqu xmm3, [eax] // src argb lea eax, [eax + 16] movdqa xmm0, xmm3 // src argb @@ -4799,10 +3713,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] - jge convertuloop4 + sub ecx, 4 + jge convertloop4 convertloop4b: add ecx, 4 - 1 @@ -4828,9 +3742,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge convertloop1 convertloop1b: @@ -4842,7 +3756,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. -// Aligned to 16 bytes. 
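Per the comments in the blend loops above ("256 - alpha", "+ src argb", "set alpha to 255"), each output channel is roughly src + dst * (256 - src_alpha) / 256, with the result alpha forced to 255. A hedged scalar sketch with illustrative names:

#include <stdint.h>

// Scalar sketch of the per-pixel blend in ARGBBlendRow above; adding src
// unscaled implies a premultiplied (attenuated) source.
static void BlendPixel(const uint8_t src[4], const uint8_t dst[4],
                       uint8_t out[4]) {
  uint32_t ia = 256 - src[3];               // 256 - source alpha
  for (int c = 0; c < 3; ++c) {             // B, G, R channels
    uint32_t v = src[c] + ((dst[c] * ia) >> 8);
    out[c] = (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates
  }
  out[3] = 255;                             // like por xmm0, xmm4
}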
__declspec(naked) __declspec(align(16)) void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -4854,19 +3767,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff psrld xmm5, 8 - align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 pshufhw xmm2, xmm0, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm0, xmm2 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm1 // next 2 pixels pshufhw xmm2, xmm1, 0FFh // 8 alpha words pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm2, [eax] // alphas + movdqu xmm2, [eax] // alphas lea eax, [eax + 16] psrlw xmm0, 8 pand xmm2, xmm4 @@ -4874,9 +3786,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { packuswb xmm0, xmm1 pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret @@ -4904,7 +3816,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { movdqa xmm4, kShuffleAlpha0 movdqa xmm5, kShuffleAlpha1 - align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels pshufb xmm0, xmm4 // isolate first 2 alphas @@ -4923,9 +3834,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { psrlw xmm1, 8 packuswb xmm0, xmm1 por xmm0, xmm2 // copy original alpha - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret @@ -4935,11 +3846,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const ulvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, - 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, - 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u }; __declspec(naked) __declspec(align(16)) void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -4948,11 +3856,10 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax - vmovdqa ymm4, kShuffleAlpha_AVX2 + vbroadcastf128 ymm4,kShuffleAlpha_AVX2 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 - align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. @@ -4966,9 +3873,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 // unmutated. vpor ymm0, ymm0, ymm6 // copy original alpha - sub ecx, 8 vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + sub ecx, 8 jg convertloop vzeroupper @@ -4979,7 +3886,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -// Aligned to 16 bytes. 
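In scalar terms the attenuate rows above premultiply each color channel by the pixel's own alpha, roughly b = b * a / 255; the SIMD versions approximate this with a 16-bit high multiply of duplicated bytes. A minimal sketch (illustrative name; exact rounding differs slightly from the SIMD path):

#include <stdint.h>

// Scalar sketch of ARGB alpha attenuation (premultiply); alpha is unchanged.
static void AttenuatePixel(uint8_t argb[4]) {
  uint32_t a = argb[3];
  argb[0] = (uint8_t)((argb[0] * a) / 255);  // B
  argb[1] = (uint8_t)((argb[1] * a) / 255);  // G
  argb[2] = (uint8_t)((argb[2] * a) / 255);  // R
}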
__declspec(naked) __declspec(align(16)) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -4990,7 +3896,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8 + 8] // dst_argb mov ecx, [esp + 8 + 12] // width - align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha @@ -5016,9 +3921,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, lea eax, [eax + 16] packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop pop edi pop esi @@ -5029,9 +3934,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const ulvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u }; // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. @@ -5044,9 +3948,8 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax - vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 + vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 - align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. @@ -5061,9 +3964,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia vpackuswb ymm0, ymm0, ymm1 // unmutated. - sub ecx, 8 vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + sub ecx, 8 jg convertloop vzeroupper @@ -5080,12 +3983,11 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax - vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 + vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 push esi push edi - align 4 convertloop: // replace VPGATHER movzx esi, byte ptr [eax + 3] // alpha0 @@ -5123,9 +4025,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia vpackuswb ymm0, ymm0, ymm1 // unmutated. - sub ecx, 8 vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + sub ecx, 8 jg convertloop pop edi @@ -5148,18 +4050,17 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 4 convertloop: - movdqa xmm0, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. 
psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 G bytes - movdqa xmm2, [eax] // A - movdqa xmm3, [eax + 16] + movdqu xmm2, [eax] // A + movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 @@ -5171,10 +4072,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { movdqa xmm1, xmm0 punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 - sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] + sub ecx, 8 jg convertloop ret } @@ -5208,32 +4109,31 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { movdqa xmm3, kARGBToSepiaG movdqa xmm4, kARGBToSepiaR - align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm6, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 B values - movdqa xmm5, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 G values punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] + movdqu xmm5, [eax] // R + movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 packuswb xmm5, xmm5 // 8 R values - movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 @@ -5242,10 +4142,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { movdqa xmm1, xmm0 // Weave BG, RA together punpcklwd xmm0, xmm5 // BGRA first 4 punpckhwd xmm1, xmm5 // BGRA next 4 - sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqu [eax], xmm0 + movdqu [eax + 16], xmm1 lea eax, [eax + 32] + sub ecx, 8 jg convertloop ret } @@ -5271,14 +4171,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, pshufd xmm5, xmm5, 0xff mov ecx, [esp + 16] /* width */ - align 4 convertloop: - movdqa xmm0, [eax] // B - movdqa xmm7, [eax + 16] + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] pmaddubsw xmm0, xmm2 pmaddubsw xmm7, xmm2 - movdqa xmm6, [eax] // G - movdqa xmm1, [eax + 16] + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 phaddsw xmm0, xmm7 // B @@ -5288,13 +4187,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, packuswb xmm0, xmm0 // 8 B values packuswb xmm6, xmm6 // 8 G values punpcklbw xmm0, xmm6 // 8 BG values - movdqa xmm1, [eax] // R - movdqa xmm7, [eax + 16] + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 phaddsw xmm1, xmm7 // R - movdqa xmm6, [eax] // A - movdqa xmm7, [eax + 16] + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A @@ -5306,11 +4205,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movdqa xmm6, xmm0 // Weave BG, RA together punpcklwd xmm0, xmm1 // BGRA first 4 punpckhwd xmm6, xmm1 // BGRA next 4 - sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm6 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 lea eax, [eax + 32] lea edx, [edx + 32] + sub ecx, 8 jg convertloop ret } @@ -5319,7 +4218,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -// Aligned to 16 bytes. 
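The gray conversion above computes a 7-bit fixed-point luma with pmaddubsw/phaddw and writes it to B, G and R while keeping alpha. A scalar sketch under the assumption that kARGBToYJ holds JPEG-style luma weights scaled so the coefficients sum to 128 (the specific values below are illustrative, not taken from this patch):

#include <stdint.h>

// Scalar sketch of ARGBGrayRow: y = (15*B + 75*G + 38*R + 64) >> 7, alpha kept.
static void GrayPixel(uint8_t argb[4]) {
  uint32_t y = (argb[0] * 15 + argb[1] * 75 + argb[2] * 38 + 64) >> 7;
  argb[0] = argb[1] = argb[2] = (uint8_t)y;  // argb[3] (alpha) unchanged
}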
__declspec(naked) __declspec(align(16)) void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { @@ -5339,25 +4237,24 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, pcmpeqb xmm6, xmm6 // generate mask 0xff000000 pslld xmm6, 24 - align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm5 // first 2 pixels pmulhuw xmm0, xmm2 // pixel * scale >> 16 - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 pmullw xmm0, xmm3 // * interval_size - movdqa xmm7, [eax] // read 4 pixels + movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 pand xmm7, xmm6 // mask alpha paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 - sub ecx, 4 - movdqa [eax], xmm0 + movdqu [eax], xmm0 lea eax, [eax + 16] + sub ecx, 4 jg convertloop ret } @@ -5366,7 +4263,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -// Aligned to 16 bytes. __declspec(naked) __declspec(align(16)) void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { @@ -5378,9 +4274,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 - align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 // first 2 @@ -5390,9 +4285,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 - sub ecx, 4 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret @@ -5413,7 +4308,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 - align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 movdqu xmm2, [esi] // read 4 pixels from src_argb1 @@ -5428,9 +4322,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop pop esi @@ -5455,16 +4349,15 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, sub ecx, 4 jl convertloop49 - align 4 convertloop4: movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] paddusb xmm0, xmm1 // src_argb0 + src_argb1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jge convertloop4 convertloop49: @@ -5477,9 +4370,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] paddusb xmm0, xmm1 // src_argb0 + src_argb1 - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge convertloop1 convertloop19: @@ -5501,16 +4394,15 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] psubusb xmm0, xmm1 // src_argb0 - src_argb1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub 
ecx, 4 jg convertloop pop esi @@ -5532,7 +4424,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov ecx, [esp + 4 + 16] // width vpxor ymm5, ymm5, ymm5 // constant 0 - align 4 convertloop: vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] @@ -5569,7 +4460,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] @@ -5599,7 +4489,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] @@ -5638,7 +4527,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] @@ -5662,9 +4550,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 - sub ecx, 8 movq qword ptr [eax + edx], xmm0 lea eax, [eax + 8] + sub ecx, 8 jg convertloop pop edi @@ -5692,7 +4580,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] @@ -5716,9 +4603,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 - sub ecx, 8 movq qword ptr [eax + edx], xmm0 lea eax, [eax + 8] + sub ecx, 8 jg convertloop pop esi @@ -5746,10 +4633,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, pcmpeqb xmm5, xmm5 // alpha 255 pslld xmm5, 24 // 0xff000000 - align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqa xmm2, xmm0 // GG @@ -5765,12 +4651,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, punpckhwd xmm0, xmm0 // Last 4 por xmm3, xmm5 // GGGA por xmm0, xmm5 - sub ecx, 16 - movdqa [edx], xmm1 - movdqa [edx + 16], xmm2 - movdqa [edx + 32], xmm3 - movdqa [edx + 48], xmm0 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 lea edx, [edx + 64] + sub ecx, 16 jg convertloop pop esi @@ -5792,15 +4678,14 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, mov ecx, [esp + 4 + 16] // width sub esi, eax - align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely - sub ecx, 16 - movdqa [edx], xmm0 + movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop pop esi @@ -5827,10 +4712,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 - align 4 convertloop: - movdqa xmm0, [eax] // read 16 pixels src_sobelx - movdqa xmm1, [eax + esi] // read 16 
pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 paddusb xmm2, xmm1 // sobel = sobelx + sobely @@ -5846,12 +4730,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, movdqa xmm7, xmm1 // YSXA punpcklwd xmm7, xmm0 // Next 4 punpckhwd xmm1, xmm0 // Last 4 - sub ecx, 16 - movdqa [edx], xmm6 - movdqa [edx + 16], xmm4 - movdqa [edx + 32], xmm7 - movdqa [edx + 48], xmm1 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 lea edx, [edx + 64] + sub ecx, 16 jg convertloop pop esi @@ -5872,8 +4756,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // area is the number of pixels in the area being averaged. // dst points to pixel to store result to. // count is number of averaged pixels to produce. -// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte -// aligned. +// Does 4 pixels at a time. void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) { @@ -5903,13 +4786,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, packssdw xmm5, xmm5 // 16 bit shorts // 4 pixel loop small blocks. - align 4 s4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -5946,13 +4828,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, jmp l4b // 4 pixel loop - align 4 l4: // top left - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] @@ -5999,9 +4880,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, jl l1b // 1 pixel loop - align 4 l1: - movdqa xmm0, [eax] + movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] lea eax, [eax + 16] psubd xmm0, [esi] @@ -6040,7 +4920,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, jne l4b // 4 pixel loop - align 4 l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -6057,26 +4936,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpckhwd xmm5, xmm1 paddd xmm0, xmm2 - movdqa xmm2, [esi] // previous row above. + movdqu xmm2, [esi] // previous row above. paddd xmm2, xmm0 paddd xmm0, xmm3 - movdqa xmm3, [esi + 16] + movdqu xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 - movdqa xmm4, [esi + 32] + movdqu xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 - movdqa xmm5, [esi + 48] + movdqu xmm5, [esi + 48] lea esi, [esi + 64] paddd xmm5, xmm0 - movdqa [edx], xmm2 - movdqa [edx + 16], xmm3 - movdqa [edx + 32], xmm4 - movdqa [edx + 48], xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 lea edx, [edx + 64] sub ecx, 4 @@ -6087,7 +4966,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, jl l1b // 1 pixel loop - align 4 l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 
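// For reference: the two routines above implement an ARGB box blur over an
// integral image. ComputeCumulativeSumRow builds per-channel running sums
// (each entry holds the sum of everything above and to the left of it), and
// CumulativeSumToAverageRow turns four lookups into a box sum divided by the
// box area. A plain-C sketch of the same idea; the pointer layout and the
// integer division here are illustrative, not the library's reference code
// (uint8_t/int32_t stand in for libyuv's uint8/int32).
#include <stdint.h>

static void CumulativeSumRowSketch(const uint8_t* row, int32_t* cumsum,
                                   const int32_t* previous_cumsum, int width) {
  int32_t running[4] = {0, 0, 0, 0};  // per-channel prefix sum of this row
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      running[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = running[c] + previous_cumsum[x * 4 + c];
    }
  }
}

static void BoxAverageSketch(const int32_t* topleft, const int32_t* botleft,
                             int width, int area, uint8_t* dst, int count) {
  // Box sum per channel = TL - TR - BL + BR of the cumulative sums.
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32_t sum = topleft[c] - topleft[width * 4 + c] -
                    botleft[c] + botleft[width * 4 + c];
      dst[c] = (uint8_t)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}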
lea eax, [eax + 4] @@ -6142,7 +5020,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, addps xmm4, xmm4 // dudv *= 4 // 4 pixel loop - align 4 l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 @@ -6164,9 +5041,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, movd xmm0, [eax + edi] // read pixel 3 punpckldq xmm6, xmm0 // combine pixel 2 and 3 addps xmm3, xmm4 // x, y += dx, dy next 2 - sub ecx, 4 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] + sub ecx, 4 jge l4 l4b: @@ -6174,7 +5051,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, jl l1b // 1 pixel loop - align 4 l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts @@ -6182,9 +5058,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge l1 l1b: pop edi @@ -6195,11 +5071,11 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_AVX2 -// Bilinear filter 16x2 -> 16x1 +// Bilinear filter 32x2 -> 32x1 __declspec(naked) __declspec(align(16)) void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi @@ -6229,7 +5105,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpxor ymm0, ymm0, ymm0 vpermd ymm5, ymm0, ymm5 - align 4 xloop: vmovdqu ymm0, [esi] vmovdqu ymm2, [esi + edx] @@ -6240,51 +5115,49 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpsrlw ymm0, ymm0, 7 vpsrlw ymm1, ymm1, 7 vpackuswb ymm0, ymm0, ymm1 // unmutates - sub ecx, 32 vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] + sub ecx, 32 jg xloop jmp xloop99 - // Blend 25 / 75. - align 4 - xloop25: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - vpavgb ymm0, ymm0, [esi + edx] - sub ecx, 32 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - jg xloop25 - jmp xloop99 + // Blend 25 / 75. + xloop25: + vmovdqu ymm0, [esi] + vmovdqu ymm1, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop25 + jmp xloop99 - // Blend 50 / 50. - align 4 - xloop50: - vmovdqu ymm0, [esi] - vpavgb ymm0, ymm0, [esi + edx] - sub ecx, 32 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - jg xloop50 - jmp xloop99 + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 - // Blend 75 / 25. - align 4 - xloop75: - vmovdqu ymm0, [esi + edx] - vpavgb ymm0, ymm0, [esi] - vpavgb ymm0, ymm0, [esi] - sub ecx, 32 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - jg xloop75 - jmp xloop99 + // Blend 75 / 25. + xloop75: + vmovdqu ymm1, [esi] + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop75 + jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. - align 4 - xloop100: - rep movsb + // Blend 100 / 0 - Copy row unchanged. 
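// For reference: InterpolateRow blends two adjacent source rows with a
// vertical fraction in [0, 256). The general SIMD path above effectively
// computes dst = (row0 * (256 - f) + row1 * f) >> 8, while fractions
// 0, 64, 128 and 192 are dispatched to the pavgb-based 100/0, 75/25, 50/50
// and 25/75 blends. A plain-C sketch of the intent (names and rounding are
// illustrative, not the library's exact reference code):
#include <stddef.h>
#include <stdint.h>

static void InterpolateRowSketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                 ptrdiff_t src_stride, int width,
                                 int source_y_fraction) {  // 0..255
  const uint8_t* src_ptr1 = src_ptr + src_stride;          // row below
  int f1 = source_y_fraction;
  int f0 = 256 - f1;
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * f0 + src_ptr1[x] * f1) >> 8);
  }
}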
+ xloop100: + rep movsb xloop99: pop edi @@ -6295,7 +5168,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -6329,11 +5201,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - align 4 xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - movdqa xmm1, xmm0 + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 pmaddubsw xmm0, xmm5 @@ -6341,57 +5212,53 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 + movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop jmp xloop99 // Blend 25 / 75. - align 4 xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 + movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop25 jmp xloop99 // Blend 50 / 50. - align 4 xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 + movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop50 jmp xloop99 // Blend 75 / 25. - align 4 xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 + movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop75 jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 4 xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop100 xloop99: @@ -6400,7 +5267,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ret } } -#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -6435,225 +5301,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - align 4 - xloop: - movdqa xmm0, [esi] // row0 - movdqa xmm2, [esi + edx] // row1 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 4 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 4 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. 
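// For reference: the SSSE3/AVX2 interpolators above halve the 8-bit fraction
// to 7 bits, pack the weights as byte pairs (128 - f, f), interleave the two
// rows, and let pmaddubsw followed by psrlw 7 do the weighted sum per byte.
// A scalar equivalent of one output byte (illustrative; not the library's
// reference code):
#include <stdint.h>

static uint8_t BlendByte7Bit(uint8_t row0, uint8_t row1,
                             int source_y_fraction) {  // 0..255
  int f = source_y_fraction >> 1;  // 0..127, matching the shr eax, 1 above
  return (uint8_t)((row0 * (128 - f) + row1 * f) >> 7);
}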
- align 4 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 4 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} -#endif // HAS_INTERPOLATEROW_SSE2 - -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) -void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. - - movd xmm0, eax // high fraction 0..127 - neg eax - add eax, 128 - movd xmm5, eax // low fraction 128..1 - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - align 4 - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqu xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 4 - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 4 - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 4 - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 4 - xloop100: - movdqu xmm0, [esi] - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} - -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) -void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
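// For reference: the 75/25 and 25/75 blend paths above are built from two
// byte averages. pavgb(a, b) = (a + b + 1) >> 1 per byte, so applying it
// twice against the same row, pavgb(pavgb(a, b), b), is roughly
// (a + 3 * b + 3) >> 2, i.e. a 25/75 weighting with slight upward rounding.
// Scalar check of that identity (illustrative only):
#include <stdint.h>

static uint8_t AvgRound(uint8_t a, uint8_t b) {  // what pavgb does per byte
  return (uint8_t)((a + b + 1) >> 1);
}

static uint8_t Blend25_75(uint8_t a, uint8_t b) {  // ~25% of a, 75% of b
  return AvgRound(AvgRound(a, b), b);
}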
- - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - align 4 xloop: movdqu xmm0, [esi] // row0 movdqu xmm2, [esi + edx] // row1 @@ -6672,57 +5319,53 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, paddw xmm0, xmm2 // sum rows paddw xmm1, xmm3 packuswb xmm0, xmm1 - sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop jmp xloop99 // Blend 25 / 75. - align 4 xloop25: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop25 jmp xloop99 // Blend 50 / 50. - align 4 xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop50 jmp xloop99 // Blend 75 / 25. - align 4 xloop75: movdqu xmm1, [esi] movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop75 jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 4 xloop100: movdqu xmm0, [esi] - sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] + sub ecx, 16 jg xloop100 xloop99: @@ -6733,84 +5376,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 -__declspec(naked) __declspec(align(16)) -void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // src_uv_stride - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - sub edi, eax - - align 4 - convertloop: - movdqa xmm0, [eax] - pavgb xmm0, [eax + edx] - sub ecx, 16 - movdqa [eax + edi], xmm0 - lea eax, [eax + 16] - jg convertloop - pop edi - ret - } -} - -#ifdef HAS_HALFROW_AVX2 -__declspec(naked) __declspec(align(16)) -void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // src_uv_stride - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - sub edi, eax - - align 4 - convertloop: - vmovdqu ymm0, [eax] - vpavgb ymm0, ymm0, [eax + edx] - sub ecx, 32 - vmovdqu [eax + edi], ymm0 - lea eax, [eax + 32] - jg convertloop - - pop edi - vzeroupper - ret - } -} -#endif // HAS_HALFROW_AVX2 - -__declspec(naked) __declspec(align(16)) -void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer - movd xmm5, [esp + 12] // selector - mov ecx, [esp + 16] // pix - pshufd xmm5, xmm5, 0 - - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - punpckldq xmm0, xmm1 - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - ret - } -} - // Specialized ARGB to Bayer that just isolates G channel. __declspec(naked) __declspec(align(16)) void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, @@ -6823,10 +5388,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, pcmpeqb xmm5, xmm5 // generate mask 0x000000ff psrld xmm5, 24 - align 4 wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] lea eax, [eax + 32] psrld xmm0, 8 // Move green to bottom. 
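// For reference: ARGBToBayerGGRow isolates the G channel of each ARGB pixel
// (byte 1 of the little-endian B,G,R,A layout) into a packed 8-bit plane,
// which is what the psrld/pand/pack sequence does 8 pixels per iteration.
// Scalar sketch (illustrative; uint8_t stands in for libyuv's uint8):
#include <stdint.h>

static void ARGBToBayerGGSketch(const uint8_t* src_argb, uint8_t* dst_bayer,
                                int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // G is one byte above B in memory
  }
}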
psrld xmm1, 8 @@ -6834,9 +5398,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, pand xmm1, xmm5 packssdw xmm0, xmm1 packuswb xmm0, xmm1 - sub ecx, 8 movq qword ptr [edx], xmm0 lea edx, [edx + 8] + sub ecx, 8 jg wloop ret } @@ -6850,46 +5414,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] + movdqu xmm5, [ecx] mov ecx, [esp + 16] // pix - align 4 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pshufb xmm0, xmm5 - pshufb xmm1, xmm5 - sub ecx, 8 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - jg wloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - movdqa xmm5, [ecx] - mov ecx, [esp + 16] // pix - - align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] pshufb xmm0, xmm5 pshufb xmm1, xmm5 - sub ecx, 8 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] + sub ecx, 8 jg wloop ret } @@ -6906,17 +5443,16 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. mov ecx, [esp + 16] // pix - align 4 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] vpshufb ymm0, ymm0, ymm5 vpshufb ymm1, ymm1, ymm5 - sub ecx, 16 vmovdqu [edx], ymm0 vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] + sub ecx, 16 jg wloop vzeroupper @@ -6967,7 +5503,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, jg shuf_any1 jmp shuf99 - align 4 shuf_0123: movdqu xmm0, [eax] lea eax, [eax + 16] @@ -6979,13 +5514,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshufhw xmm1, xmm1, 01Bh pshuflw xmm1, xmm1, 01Bh packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg shuf_0123 jmp shuf99 - align 4 shuf_0321: movdqu xmm0, [eax] lea eax, [eax + 16] @@ -6997,13 +5531,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshufhw xmm1, xmm1, 039h pshuflw xmm1, xmm1, 039h packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg shuf_0321 jmp shuf99 - align 4 shuf_2103: movdqu xmm0, [eax] lea eax, [eax + 16] @@ -7015,13 +5548,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshufhw xmm1, xmm1, 093h pshuflw xmm1, xmm1, 093h packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg shuf_2103 jmp shuf99 - align 4 shuf_3012: movdqu xmm0, [eax] lea eax, [eax + 16] @@ -7033,9 +5565,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshufhw xmm1, xmm1, 0C6h pshuflw xmm1, xmm1, 0C6h packuswb xmm0, xmm1 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg shuf_3012 shuf99: @@ -7066,7 +5598,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -7104,7 +5635,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -7141,7 +5671,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, pxor xmm3, xmm3 // 0 constant for zero 
extending bytes to ints. // 2 pixel loop. - align 4 convertloop: // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel @@ -7177,9 +5706,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, cvttps2dq xmm4, xmm4 packuswb xmm0, xmm4 packuswb xmm0, xmm0 - sub ecx, 2 movq qword ptr [edx], xmm0 lea edx, [edx + 8] + sub ecx, 2 jg convertloop pop esi ret @@ -7203,7 +5732,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, mov ecx, [esp + 16] /* width */ // 2 pixel loop. - align 4 convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] @@ -7217,9 +5745,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 - sub ecx, 2 vmovq qword ptr [edx], xmm0 lea edx, [edx + 8] + sub ecx, 2 jg convertloop vzeroupper ret @@ -7239,7 +5767,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. - align 4 convertloop: movzx edx, byte ptr [eax] lea eax, [eax + 4] @@ -7273,7 +5800,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. - align 4 convertloop: movzx edx, byte ptr [eax] lea eax, [eax + 4] @@ -7315,7 +5841,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, pxor xmm5, xmm5 // 4 pixel loop. - align 4 convertloop: movdqu xmm0, qword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 @@ -7382,9 +5907,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movzx edx, byte ptr [eax + 15] // copy alpha. mov byte ptr [edi + 15], dl - sub ecx, 4 lea eax, [eax + 16] lea edi, [edi + 16] + sub ecx, 4 jg convertloop pop edi diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc index 5b33b5f04..482c5a61e 100644 --- a/third_party/libyuv/source/scale.cc +++ b/third_party/libyuv/source/scale.cc @@ -57,20 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; } -#elif defined(HAS_SCALEROWDOWN2_SSE2) +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : - ScaleRowDown2Box_Unaligned_SSE2); - if (IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : - ScaleRowDown2Box_SSE2); - } + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); } -#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -112,21 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height, ScaleRowDown2 = filtering ? 
ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } -#elif defined(HAS_SCALEROWDOWN2_16_SSE2) +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? - ScaleRowDown2_Unaligned_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 : - ScaleRowDown2Box_Unaligned_16_SSE2); - if (IS_ALIGNED(src_ptr, 16) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); - } + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); } -#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -168,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } -#elif defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; } -#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -212,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height, ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } -#elif defined(HAS_SCALEROWDOWN4_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } -#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -271,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height, } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_SSSE3; @@ -351,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } #endif #if defined(HAS_SCALEROWDOWN34_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; @@ -445,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; } } -#elif defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; @@ -456,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } } -#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -522,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; } } -#elif defined(HAS_SCALEROWDOWN38_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; @@ -533,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; } } -#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) +#endif +#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { @@ -758,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height, uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; #if defined(HAS_SCALEADDROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) #ifdef AVOID_OVERREAD - IS_ALIGNED(src_width, 16) && + && IS_ALIGNED(src_width, 16) #endif - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ) { ScaleAddRows = ScaleAddRows_SSE2; } #endif @@ -830,11 +819,11 @@ static void ScalePlaneBox_16(int src_width, int src_height, uint32* dst_ptr, int src_width, int 
src_height) = ScaleAddRows_16_C; #if defined(HAS_SCALEADDROWS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) #ifdef AVOID_OVERREAD - IS_ALIGNED(src_width, 16) && + && IS_ALIGNED(src_width, 16) #endif - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + ) { ScaleAddRows = ScaleAddRows_16_SSE2; } #endif @@ -886,29 +875,23 @@ void ScalePlaneBilinearDown(int src_width, int src_height, src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_AVX2; @@ -916,7 +899,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_NEON; @@ -924,7 +907,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; if (IS_ALIGNED(src_width, 4)) { InterpolateRow = InterpolateRow_MIPS_DSPR2; @@ -988,29 +971,23 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_16_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; @@ -1018,7 +995,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) 
&& src_width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_16_NEON; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; @@ -1026,7 +1003,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; if (IS_ALIGNED(src_width, 4)) { InterpolateRow = InterpolateRow_16_MIPS_DSPR2; @@ -1087,29 +1064,23 @@ void ScalePlaneBilinearUp(int src_width, int src_height, src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 32)) { InterpolateRow = InterpolateRow_AVX2; @@ -1117,7 +1088,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_NEON; @@ -1125,7 +1096,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_MIPS_DSPR2; @@ -1144,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_SSE2; } #endif @@ -1226,29 +1195,23 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { 
InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_16_AVX2; if (IS_ALIGNED(dst_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; @@ -1256,7 +1219,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_16_NEON; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_16_NEON; @@ -1264,7 +1227,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_16_MIPS_DSPR2; @@ -1283,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_16_C; #if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleColsUp2_16_SSE2; } #endif @@ -1366,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_SSE2; } #endif @@ -1401,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleCols = ScaleColsUp2_16_C; #if defined(HAS_SCALECOLS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleCols = ScaleColsUp2_16_SSE2; } #endif diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc index e339cd7c7..05b58e1ba 100644 --- a/third_party/libyuv/source/scale_argb.cc +++ b/third_party/libyuv/source/scale_argb.cc @@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height, } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_SSE2 : ScaleARGBRowDown2Box_SSE2); } -#elif defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : ScaleARGBRowDown2_NEON; } @@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; } #endif @@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height, assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } -#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_argb, 4)) { +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; } @@ -190,29 +185,23 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, src_argb += xl * 4; x -= (int)(xl << 16); #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(clip_src_width, 32)) { InterpolateRow = InterpolateRow_AVX2; @@ -220,15 +209,15 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(clip_src_width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 && +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; if (IS_ALIGNED(clip_src_width, 4)) { @@ -286,29 +275,23 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_AVX2; @@ -316,15 +299,15 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_MIPS_DSPR2; } @@ -346,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif @@ -427,18 +408,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, uint8* rgb_buf, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(src_width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(src_width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; @@ -446,7 +424,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(src_width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; @@ -467,29 +445,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = 
InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_AVX2; @@ -497,15 +469,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { InterpolateRow = InterpolateRow_MIPS_DSPR2; } @@ -531,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif @@ -640,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height, if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc index e4b2acc41..96e2564b0 100644 --- a/third_party/libyuv/source/scale_common.cc +++ b/third_party/libyuv/source/scale_common.cc @@ -885,31 +885,23 @@ void ScalePlaneVertical(int src_height, assert(dst_height > 0); src_argb += (x >> 16) * bpp; #if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } + InterpolateRow = 
InterpolateRow_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } + InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width_bytes, 32)) { InterpolateRow = InterpolateRow_AVX2; @@ -917,15 +909,15 @@ void ScalePlaneVertical(int src_height, } #endif #if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; @@ -967,31 +959,23 @@ void ScalePlaneVertical_16(int src_height, assert(dst_height > 0); src_argb += (x >> 16) * wpp; #if defined(HAS_INTERPOLATEROW_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSE2; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSE2; - } + InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_Unaligned_16_SSSE3; - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - InterpolateRow = InterpolateRow_16_SSSE3; - } + InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_16_AVX2; if (IS_ALIGNED(dst_width_bytes, 32)) { InterpolateRow = InterpolateRow_16_AVX2; @@ -999,15 +983,15 @@ void ScalePlaneVertical_16(int src_height, } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) - if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_16_NEON; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_16_NEON; } } #endif -#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; diff --git 
a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 1b8a5ba58..7921219b5 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -16,7 +16,8 @@ extern "C" { #endif // This module is for GCC Neon. -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) // NEON downscalers with interpolation. // Provided by Fritz Koenig @@ -756,7 +757,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } -#endif // __ARM_NEON__ +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc index 64c7d10db..fb68b67d2 100644 --- a/third_party/libyuv/source/scale_neon64.cc +++ b/third_party/libyuv/source/scale_neon64.cc @@ -8,133 +8,122 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "libyuv/scale.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// This module is for GCC Neon. +// This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -#ifdef HAS_SCALEROWDOWN2_NEON + // Read 32x1 throw away even pixels, and write 16x1. void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" - // load even pixels into q0, odd into q1 + // load even pixels into v0, odd into v1 MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %2, %2, #16 \n" // 16 processed per loop MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : - : "q0", "q1" // Clobber List + : "v0", "v1" // Clobber List ); } -#endif //HAS_SCALEROWDOWN2_NEON -#ifdef HAS_SCALEROWDOWN2_NEON // Read 32x2 average down and write 16x1. void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( // change the stride to row 2 pointer - "add %1, %0 \n" - ".p2align 2 \n" + "add %1, %1, %0 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc MEMACCESS(1) - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" MEMACCESS(2) - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : - : "q0", "q1", "q2", "q3" // Clobber List + : "v0", "v1", "v2", "v3" // Clobber List ); } -#endif //HAS_SCALEROWDOWN2_NEON -#ifdef HAS_SCALEROWDOWN4_NEON void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "q0", "q1", "memory", "cc" + : "v0", "v1", "v2", "v3", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN4_NEON -#ifdef HAS_SCALEROWDOWN4_NEON void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 MEMACCESS(3) - "vld1.8 {q1}, [%3]! \n" + "ld1 {v1.16b}, [%2], #16 \n" MEMACCESS(4) - "vld1.8 {q2}, [%4]! \n" + "ld1 {v2.16b}, [%3], #16 \n" MEMACCESS(5) - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %5, %5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding MEMACCESS(1) - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 : - : "q0", "q1", "q2", "q3", "memory", "cc" + : "v0", "v1", "v2", "v3", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN4_NEON -#ifdef HAS_SCALEROWDOWN34_NEON // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. @@ -142,136 +131,129 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" - "1: \n" + "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %2, %2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! 
\n" - "bgt 1b \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "d0", "d1", "d2", "d3", "memory", "cc" + : "v0", "v1", "v2", "v3", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN34_NEON -#ifdef HAS_SCALEROWDOWN34_NEON void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - ".p2align 2 \n" - "1: \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "bgt 1b \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", + "v20", "memory", "cc" ); } -#endif //ScaleRowDown34_0_Box_NEON -#ifdef HAS_SCALEROWDOWN34_NEON void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - ".p2align 2 \n" - "1: \n" + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 - "subs %2, %2, #24 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %2, %2, #24 \n" // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" + "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN34_NEON -#ifdef HAS_SCALEROWDOWN38_NEON -#define HAS_SCALEROWDOWN38_NEON static uvec8 kShuf38 = { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; static uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; + { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; static vec16 kMult38_Div6 = { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; @@ -285,504 +267,498 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, uint8* dst_ptr, int dst_width) { asm volatile ( MEMACCESS(3) - "vld1.8 {q3}, [%3] \n" - ".p2align 2 \n" - "1: \n" + "ld1 {v3.16b}, [%3] \n" + "1: \n" MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %2, %2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" + "st1 {v2.8b}, [%1], #8 \n" MEMACCESS(1) - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + : "v0", "v1", "v2", "v3", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN38_NEON - -#ifdef HAS_SCALEROWDOWN38_NEON // 32x3 -> 12x1 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { const uint8* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; asm volatile ( MEMACCESS(5) - "vld1.16 {q13}, [%5] \n" + "ld1 {v29.8h}, [%5] \n" MEMACCESS(6) - "vld1.8 {q14}, [%6] \n" + "ld1 {v30.16b}, [%6] \n" MEMACCESS(7) - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - ".p2align 2 \n" - "1: \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" MEMACCESS(4) - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %4, %4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" + "add v0.8h, v0.8h, v16.8h \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. 
- "vqrdmulh.s16 q0, q0, q15 \n" + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" + "st1 {v3.8b}, [%1], #8 \n" MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 : "r"(&kMult38_Div6), // %5 "r"(&kShuf38_2), // %6 "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", + "v30", "v31", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN38_NEON -#ifdef HAS_SCALEROWDOWN38_NEON // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; asm volatile ( MEMACCESS(4) - "vld1.16 {q13}, [%4] \n" + "ld1 {v30.8h}, [%4] \n" MEMACCESS(5) - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - ".p2align 2 \n" - "1: \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %3, %3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" + "uqrshrn v2.8b, v2.8h, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 // combine source lines - "vadd.u16 q1, q3 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" + "st1 {v3.8b}, [%1], #8 \n" MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v30", "v31", "memory", "cc" ); } -#endif //HAS_SCALEROWDOWN38_NEON -#if 0 // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; asm volatile ( "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" "cmp %4, #64 \n" - "beq 75f \n" + "b.eq 75f \n" "cmp %4, #128 \n" - "beq 50f \n" + "b.eq 50f \n" "cmp %4, #192 \n" - "beq 25f \n" + "b.eq 25f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" // General purpose row blend. "1: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" + "ld1 {v1.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" + "ld1 {v0.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" "subs %3, %3, #16 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" "99: \n" MEMACCESS(0) - "vst1.8 {d1[7]}, [%0] \n" + "st1 {v0.b}[15], [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 + "+r"(source_y_fraction),// %4 + "+r"(y_fraction) // %5 : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" ); } -#endif //0 -#ifdef HAS_SCALEARGBROWDOWN2_NEON void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.32 {q0, q1}, [%0]! \n" - MEMACCESS(0) - "vld2.32 {q2, q3}, [%0]! \n" + MEMACCESS (0) + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + MEMACCESS (0) + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 + MEMACCESS (1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + MEMACCESS (1) + "st1 {v3.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (dst), // %1 + "+r" (dst_width) // %2 : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List ); } -#endif //HAS_SCALEARGBROWDOWN2_NEON -#ifdef HAS_SCALEARGBROWDOWN2_NEON void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + MEMACCESS (0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + MEMACCESS (1) + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. 
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + MEMACCESS (2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (src_stride), // %1 + "+r" (dst), // %2 + "+r" (dst_width) // %3 : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" ); } -#endif //HAS_SCALEARGBROWDOWN2_NEON -#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( - "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" + "ld1 {v0.s}[0], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" + "ld1 {v0.s}[1], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" + "ld1 {v0.s}[2], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" + "ld1 {v0.s}[3], [%0], %3 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" + : "r"(static_cast(src_stepx * 4)) // %3 + : "memory", "cc", "v0" ); } -#endif //HAS_SCALEARGBROWDOWNEVEN_NEON -#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. +// TODO, might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( - "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" + "ld1 {v1.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" + "ld1 {v2.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" + "ld1 {v3.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" + "ld1 {v4.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" + "ld1 {v5.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" + "ld1 {v6.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. 
MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + : "r"(src_stepx * 4) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } -#endif // HAS_SCALEARGBROWDOWNEVEN_NEON -#endif // __aarch64__ +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc index 352e66782..bb6e57efe 100644 --- a/third_party/libyuv/source/scale_posix.cc +++ b/third_party/libyuv/source/scale_posix.cc @@ -101,24 +101,20 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + :: "memory", "cc", "xmm0", "xmm1" ); } @@ -130,8 +126,8 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "psrlw $0x8,%%xmm0 \n" @@ -142,18 +138,14 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "pavgw %%xmm2,%%xmm0 \n" "pavgw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm5" ); } @@ -163,118 +155,11 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int 
dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN "1: \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - BUNDLEALIGN MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" @@ -296,13 +181,8 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" ); } @@ -315,8 +195,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" @@ -330,11 +210,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm5" ); } @@ -348,18 +224,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 - BUNDLEALIGN - 
MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 - MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 - MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 + MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm4,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" @@ -388,13 +262,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "+r"(dst_width), // %2 "+r"(stridex3) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" ); } @@ -412,8 +281,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm2,%%xmm1 \n" "palignr $0x8,%%xmm0,%%xmm1 \n" @@ -429,11 +298,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -461,8 +326,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm6 \n" @@ -479,9 +344,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm4,%%xmm6 \n" @@ -498,13 +362,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -533,8 +392,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" @@ -553,8 +412,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqa " 
MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" @@ -572,13 +431,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -590,8 +444,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm4,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -607,10 +461,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, "+r"(dst_width) // %2 : "m"(kShuf38a), // %3 "m"(kShuf38b) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm4", "xmm5" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" ); } @@ -631,9 +482,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" + "pavgb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" "movdqa %%xmm0,%%xmm6 \n" @@ -643,23 +495,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, "paddusw %%xmm0,%%xmm1 \n" "pmulhuw %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "sub $0x6,%2 \n" "movd %%xmm1," MEMACCESS(1) " \n" "psrlq $0x10,%%xmm1 \n" "movd %%xmm1," MEMACCESS2(0x2,1) " \n" "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -679,8 +526,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 "movhlps %%xmm0,%%xmm1 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm0 \n" @@ -689,7 +536,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, "punpcklbw %%xmm5,%%xmm7 \n" "paddusw %%xmm6,%%xmm0 \n" "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 "lea " MEMLEA(0x10,0) ",%0 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm6 \n" @@ -711,23 +558,18 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, "paddusw %%xmm7,%%xmm6 \n" "pmulhuw %%xmm4,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" - "sub $0x6,%2 \n" "movd %%xmm6," MEMACCESS(1) " \n" "psrlq $0x10,%%xmm6 \n" "movd %%xmm6," 
MEMACCESS2(0x2,1) " \n" "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } @@ -741,7 +583,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "mov %0,%3 \n" "add %6,%0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -753,7 +595,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "2: \n" - "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "add %6,%0 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -765,8 +607,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, LABELALIGN "3: \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x10,3) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x10,%4 \n" @@ -778,10 +620,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "+r"(src_width), // %4 "+rm"(src_height) // %5 : "rm"((intptr_t)(src_stride)) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" ); } @@ -813,7 +652,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm1 \n" - BUNDLEALIGN MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 "movd %k2,%%xmm4 \n" "pshufb %%xmm5,%%xmm1 \n" @@ -853,13 +691,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+rm"(dst_width) // %5 : "rm"(x), // %6 "rm"(dx) // %7 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -870,25 +703,21 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" - "sub $0x20,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x20,%2 \n" "jg 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + :: "memory", "cc", "xmm0", "xmm1" ); } @@ -898,22 +727,18 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "shufps $0xdd,%%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + 
"sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + :: "memory", "cc", "xmm0", "xmm1" ); } @@ -923,25 +748,21 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + :: "memory", "cc", "xmm0", "xmm1" ); } @@ -951,11 +772,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - BUNDLEALIGN - MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 - MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" @@ -963,21 +783,16 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" ); } @@ -996,29 +811,22 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, "movd " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 "punpckldq %%xmm1,%%xmm0 \n" - BUNDLEALIGN MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" "punpckldq %%xmm3,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 "+r"(dst_width), // %3 "+r"(src_stepx_x12) // %4 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" ); } @@ -1040,11 +848,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, "movq " MEMACCESS(0) ",%%xmm0 \n" MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - BUNDLEALIGN MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" "movq " MEMACCESS(5) 
",%%xmm2 \n" - BUNDLEALIGN MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 @@ -1055,9 +861,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 @@ -1065,14 +871,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, "+rm"(dst_width), // %3 "+r"(src_stepx_x12), // %4 "+r"(row1) // %5 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" ); } @@ -1111,15 +911,14 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, "pextrw $0x3,%%xmm2,%k1 \n" "punpckldq %%xmm4,%%xmm1 \n" "punpcklqdq %%xmm1,%%xmm0 \n" - "sub $0x4,%4 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%4 \n" "jge 40b \n" "49: \n" "test $0x2,%4 \n" "je 29f \n" - BUNDLEALIGN MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 "pextrw $0x5,%%xmm2,%k0 \n" @@ -1139,13 +938,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, "+r"(dst_width) // %4 : "rm"(x), // %5 "rm"(dx) // %6 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" ); } @@ -1156,28 +950,22 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, asm volatile ( LABELALIGN "1: \n" - "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpckldq %%xmm0,%%xmm0 \n" "punpckhdq %%xmm1,%%xmm1 \n" - "sub $0x8,%2 \n" - "movdqa %%xmm0," MEMACCESS(0) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1" ); } @@ -1225,7 +1013,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, "paddd %%xmm3,%%xmm2 \n" MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 "psrlw $0x9,%%xmm1 \n" - BUNDLEALIGN MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 "pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" @@ -1245,7 +1032,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, "add $0x1,%2 \n" "jl 99f \n" "psrlw $0x9,%%xmm2 \n" - BUNDLEALIGN MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 "pshufb %%xmm5,%%xmm2 \n" "pshufb %%xmm4,%%xmm0 \n" @@ -1264,13 +1050,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, "+r"(x1) // %4 : "rm"(x), // %5 "rm"(dx) // %6 - : "memory", "cc" -#if defined(__native_client__) && defined(__x86_64__) - , "r14" -#endif -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 
-#endif
+    : "memory", "cc", NACL_R14
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc
index 840b9738d..e0209cdec 100644
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -103,17 +103,16 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov edx, [esp + 12] // dst_ptr
     mov ecx, [esp + 16] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     psrlw xmm0, 8 // isolate odd pixels.
     psrlw xmm1, 8
     packuswb xmm0, xmm1
-    sub ecx, 16
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
 
     ret
@@ -133,10 +132,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
     psrlw xmm5, 8
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
 
     movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
@@ -149,9 +147,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1
 
-    sub ecx, 16
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
 
     ret
@@ -172,120 +170,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
     psrlw xmm5, 8
 
-    align 4
-  wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm2, [eax + esi]
-    movdqa xmm3, [eax + esi + 16]
-    lea eax, [eax + 32]
-    pavgb xmm0, xmm2 // average rows
-    pavgb xmm1, xmm3
-
-    movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
-    psrlw xmm0, 8
-    movdqa xmm3, xmm1
-    psrlw xmm1, 8
-    pand xmm2, xmm5
-    pand xmm3, xmm5
-    pavgw xmm0, xmm2
-    pavgw xmm1, xmm3
-    packuswb xmm0, xmm1
-
-    sub ecx, 16
-    movdqa [edx], xmm0
-    lea edx, [edx + 16]
-    jg wloop
-
-    pop esi
-    ret
-  }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov eax, [esp + 4] // src_ptr
-    // src_stride ignored
-    mov edx, [esp + 12] // dst_ptr
-    mov ecx, [esp + 16] // dst_width
-
-    align 4
-  wloop:
-    movdqu xmm0, [eax]
-    movdqu xmm1, [eax + 16]
-    lea eax, [eax + 32]
-    psrlw xmm0, 8 // isolate odd pixels.
-    psrlw xmm1, 8
-    packuswb xmm0, xmm1
-    sub ecx, 16
-    movdqu [edx], xmm0
-    lea edx, [edx + 16]
-    jg wloop
-
-    ret
-  }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
-                                        ptrdiff_t src_stride,
-                                        uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov eax, [esp + 4] // src_ptr
-    // src_stride
-    mov edx, [esp + 12] // dst_ptr
-    mov ecx, [esp + 16] // dst_width
-    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
-    psrlw xmm5, 8
-
-    align 4
-  wloop:
-    movdqu xmm0, [eax]
-    movdqu xmm1, [eax + 16]
-    lea eax, [eax + 32]
-
-    movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
-    psrlw xmm0, 8
-    movdqa xmm3, xmm1
-    psrlw xmm1, 8
-    pand xmm2, xmm5
-    pand xmm3, xmm5
-    pavgw xmm0, xmm2
-    pavgw xmm1, xmm3
-    packuswb xmm0, xmm1
-
-    sub ecx, 16
-    movdqu [edx], xmm0
-    lea edx, [edx + 16]
-    jg wloop
-
-    ret
-  }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
-                                     ptrdiff_t src_stride,
-                                     uint8* dst_ptr, int dst_width) {
-  __asm {
-    push esi
-    mov eax, [esp + 4 + 4] // src_ptr
-    mov esi, [esp + 4 + 8] // src_stride
-    mov edx, [esp + 4 + 12] // dst_ptr
-    mov ecx, [esp + 4 + 16] // dst_width
-    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
-    psrlw xmm5, 8
-
-    align 4
   wloop:
     movdqu xmm0, [eax]
     movdqu xmm1, [eax + 16]
@@ -305,9 +189,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1
 
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop
 
     pop esi
@@ -329,19 +213,18 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     psrld xmm5, 24
     pslld xmm5, 16
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     pand xmm0, xmm5
     pand xmm1, xmm5
     packuswb xmm0, xmm1
     psrlw xmm0, 8
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
 
     ret
@@ -364,18 +247,17 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
     psrlw xmm7, 8
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm2, [eax + esi]
-    movdqa xmm3, [eax + esi + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + esi]
+    movdqu xmm3, [eax + esi + 16]
     pavgb xmm0, xmm2 // average rows
     pavgb xmm1, xmm3
-    movdqa xmm2, [eax + esi * 2]
-    movdqa xmm3, [eax + esi * 2 + 16]
-    movdqa xmm4, [eax + edi]
-    movdqa xmm5, [eax + edi + 16]
+    movdqu xmm2, [eax + esi * 2]
+    movdqu xmm3, [eax + esi * 2 + 16]
+    movdqu xmm4, [eax + edi]
+    movdqu xmm5, [eax + edi + 16]
     lea eax, [eax + 32]
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
@@ -398,9 +280,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm0, xmm2
     packuswb xmm0, xmm0
 
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
 
     pop edi
@@ -427,10 +309,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqa xmm4, kShuf1
     movdqa xmm5, kShuf2
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     movdqa xmm2, xmm1
     palignr xmm1, xmm0, 8
@@ -481,10 +362,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     movdqa xmm6, kMadd11
     movdqa xmm7, kRound34
 
-    align 4
   wloop:
-    movdqa xmm0, [eax] // pixels 0..7
-    movdqa xmm1, [eax + esi]
+    movdqu xmm0, [eax] // pixels 0..7
+    movdqu xmm1, [eax + esi]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
     pmaddubsw xmm0, xmm5
@@ -501,8 +381,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw xmm0, 2
     packuswb xmm0, xmm0
     movq qword ptr [edx + 8], xmm0
-    movdqa xmm0, [eax + 16] // pixels 16..23
-    movdqa xmm1, [eax + esi + 16]
+    movdqu xmm0, [eax + 16] // pixels 16..23
+    movdqu xmm1, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
@@ -511,9 +391,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx + 24]
+    sub ecx, 24
     jg wloop
 
     pop esi
@@ -540,10 +420,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     movdqa xmm6, kMadd11
     movdqa xmm7, kRound34
 
-    align 4
   wloop:
-    movdqa xmm0, [eax] // pixels 0..7
-    movdqa xmm1, [eax + esi]
+    movdqu xmm0, [eax] // pixels 0..7
+    movdqu xmm1, [eax + esi]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
@@ -562,8 +441,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw xmm0, 2
     packuswb xmm0, xmm0
     movq qword ptr [edx + 8], xmm0
-    movdqa xmm0, [eax + 16] // pixels 16..23
-    movdqa xmm1, [eax + esi + 16]
+    movdqu xmm0, [eax + 16] // pixels 16..23
+    movdqu xmm1, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
@@ -573,9 +452,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx+24]
+    sub ecx, 24
     jg wloop
 
     pop esi
@@ -597,20 +476,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqa xmm4, kShuf38a
     movdqa xmm5, kShuf38b
 
-    align 4
   xloop:
-    movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
-    movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+    movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+    movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
     lea eax, [eax + 32]
     pshufb xmm0, xmm4
     pshufb xmm1, xmm5
     paddusb xmm0, xmm1
 
-    sub ecx, 12
     movq qword ptr [edx], xmm0 // write 12 pixels
     movhlps xmm1, xmm0
     movd [edx + 8], xmm1
     lea edx, [edx + 12]
+    sub ecx, 12
     jg xloop
 
     ret
@@ -633,10 +511,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     movdqa xmm4, kScaleAc33
     pxor xmm5, xmm5
 
-    align 4
   xloop:
-    movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
-    movdqa xmm6, [eax + esi]
+    movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+    movdqu xmm6, [eax + esi]
     movhlps xmm1, xmm0
     movhlps xmm7, xmm6
     punpcklbw xmm0, xmm5
@@ -645,7 +522,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     punpcklbw xmm7, xmm5
     paddusw xmm0, xmm6
     paddusw xmm1, xmm7
-    movdqa xmm6, [eax + esi * 2]
+    movdqu xmm6, [eax + esi * 2]
     lea eax, [eax + 16]
     movhlps xmm7, xmm6
     punpcklbw xmm6, xmm5
@@ -671,11 +548,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
     packuswb xmm6, xmm6
 
-    sub ecx, 6
     movd [edx], xmm6 // write 6 pixels
     psrlq xmm6, 16
     movd [edx + 2], xmm6
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop
 
     pop esi
@@ -699,11 +576,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     movdqa xmm4, kShufAb2
     movdqa xmm5, kScaleAb2
 
-    align 4
   xloop:
-    movdqa xmm0, [eax] // average 2 rows into xmm0
-    pavgb xmm0, [eax + esi]
+    movdqu xmm0, [eax] // average 2 rows into xmm0
+    movdqu xmm1, [eax + esi]
     lea eax, [eax + 16]
+    pavgb xmm0, xmm1
 
     movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb xmm1, xmm2
@@ -716,11 +593,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
     packuswb xmm1, xmm1
 
-    sub ecx, 6
     movd [edx], xmm1 // write 6 pixels
     psrlq xmm1, 16
     movd [edx + 2], xmm1
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop
 
     pop esi
@@ -747,10 +624,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pxor xmm4, xmm4
     dec ebx
 
-    align 4
   xloop:
     // first row
-    movdqa xmm0, [esi]
+    movdqu xmm0, [esi]
     lea eax, [esi + edx]
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm4
@@ -761,9 +637,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     je ydone
 
     // sum remaining rows
-    align 4
   yloop:
-    movdqa xmm2, [eax] // read 16 pixels
+    movdqu xmm2, [eax] // read 16 pixels
     lea eax, [eax + edx] // advance to next row
     movdqa xmm3, xmm2
     punpcklbw xmm2, xmm4
@@ -773,10 +648,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     sub ebp, 1
     jg yloop
 
-    align 4
   ydone:
-    movdqa [edi], xmm0
-    movdqa [edi + 16], xmm1
+    movdqu [edi], xmm0
+    movdqu [edi + 16], xmm1
     lea edi, [edi + 32]
 
     sub ecx, 16
@@ -828,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pextrw edx, xmm2, 3 // get x1 integer. preroll
 
     // 2 Pixel loop.
-    align 4
   xloop2:
     movdqa xmm1, xmm2 // x0, x1 fractions.
     paddd xmm2, xmm3 // x += dx
@@ -851,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     sub ecx, 2 // 2 pixels
     jge xloop2
 
-    align 4
   xloop29:
 
     add ecx, 2 - 1
@@ -869,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movd ebx, xmm0
     mov [edi], bl
 
-    align 4
   xloop99:
 
     pop edi
@@ -889,17 +760,16 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     mov eax, [esp + 8] // src_ptr
     mov ecx, [esp + 12] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
+    movdqu xmm0, [eax]
     lea eax, [eax + 16]
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm0
     punpckhbw xmm1, xmm1
-    sub ecx, 32
-    movdqa [edx], xmm0
-    movdqa [edx + 16], xmm1
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 32
     jg wloop
 
     ret
@@ -918,15 +788,14 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     mov edx, [esp + 12] // dst_argb
     mov ecx, [esp + 16] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     shufps xmm0, xmm1, 0xdd
-    sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
 
     ret
@@ -945,18 +814,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     mov edx, [esp + 12] // dst_argb
     mov ecx, [esp + 16] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     movdqa xmm2, xmm0
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
 
     ret
@@ -976,12 +844,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     mov edx, [esp + 4 + 12] // dst_argb
     mov ecx, [esp + 4 + 16] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm2, [eax + esi]
-    movdqa xmm3, [eax + esi + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + esi]
+    movdqu xmm3, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm0, xmm2 // average rows
     pavgb xmm1, xmm3
@@ -989,9 +856,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
 
     pop esi
@@ -1016,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     lea ebx, [ebx * 4]
     lea edi, [ebx + ebx * 2]
 
-    align 4
   wloop:
     movd xmm0, [eax]
     movd xmm1, [eax + ebx]
@@ -1026,9 +892,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     lea eax, [eax + ebx * 4]
     punpckldq xmm2, xmm3
     punpcklqdq xmm0, xmm2
-    sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
 
     pop edi
@@ -1057,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     lea ebx, [ebx * 4]
     lea edi, [ebx + ebx * 2]
 
-    align 4
   wloop:
     movq xmm0, qword ptr [eax] // row0 4 pairs
     movhps xmm0, qword ptr [eax + ebx]
@@ -1075,9 +940,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop
 
     pop edi
@@ -1118,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     jl xloop49
 
     // 4 Pixel loop.
-    align 4
   xloop4:
     movd xmm0, [esi + eax * 4] // 1 source x0 pixels
     movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1133,12 +997,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     pextrw edx, xmm2, 3 // get x1 integer. next iteration.
     punpckldq xmm1, xmm4 // x2 x3
     punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
-    sub ecx, 4 // 4 pixels
     movdqu [edi], xmm0
     lea edi, [edi + 16]
+    sub ecx, 4 // 4 pixels
     jge xloop4
 
-    align 4
   xloop49:
     test ecx, 2
     je xloop29
@@ -1159,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     // 1 Pixels.
     movd xmm0, [esi + eax * 4] // 1 source x2 pixels
     movd dword ptr [edi], xmm0
 
-    align 4
   xloop99:
     pop esi
@@ -1209,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     pextrw edx, xmm2, 3 // get x1 integer. preroll
 
     // 2 Pixel loop.
-    align 4
   xloop2:
     movdqa xmm1, xmm2 // x0, x1 fractions.
     paddd xmm2, xmm3 // x += dx
@@ -1229,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     sub ecx, 2 // 2 pixels
     jge xloop2
 
-    align 4
   xloop29:
 
    add ecx, 2 - 1
@@ -1246,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
     movd [edi], xmm0
 
-    align 4
   xloop99:
 
     pop edi
@@ -1265,17 +1124,16 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     mov eax, [esp + 8] // src_argb
     mov ecx, [esp + 12] // dst_width
 
-    align 4
   wloop:
-    movdqa xmm0, [eax]
+    movdqu xmm0, [eax]
     lea eax, [eax + 16]
     movdqa xmm1, xmm0
     punpckldq xmm0, xmm0
     punpckhdq xmm1, xmm1
-    sub ecx, 8
-    movdqa [edx], xmm0
-    movdqa [edx + 16], xmm1
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 8
     jg wloop
 
     ret
diff --git a/third_party/libyuv/source/video_common.cc b/third_party/libyuv/source/video_common.cc
index efbedf46e..379a0669a 100644
--- a/third_party/libyuv/source/video_common.cc
+++ b/third_party/libyuv/source/video_common.cc
@@ -33,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
   {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
   {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
   {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},
+  {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
   {FOURCC_RGB3, FOURCC_RAW },
   {FOURCC_BGR3, FOURCC_24BG},
   {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
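
The scale_win.cc changes above follow one pattern: the aligned SSE2/SSSE3 loads and stores on the source and destination row pointers (movdqa) become unaligned ones (movdqu), the explicit "align 4" directives and the separate *_Unaligned_* copies of the row functions are dropped, and the "sub ecx, N" loop-counter update is moved next to the store so its flags feed the following jg directly. For reference, below is a minimal plain-C sketch of the operation these rows compute: the 2x2 box filter that ScaleRowDown2Box_SSE2 produces 16 output pixels at a time. The function name and the exact rounding are illustrative only (libyuv keeps its own C reference rows); this sketch is not part of the patch.

/* Illustrative only: a plain-C equivalent of the 2x2 box-filter row.
 * Assumes libyuv-style uint8/ptrdiff_t parameters; name is hypothetical. */
#include <stddef.h>

typedef unsigned char uint8;

static void ScaleRowDown2Box_C_sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               /* top source row */
  const uint8* t = src_ptr + src_stride;  /* bottom source row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    /* Average each 2x2 block of source pixels into one destination pixel;
     * the +2 rounds to nearest, in the spirit of the pavgb/pavgw rounding
     * used by the SSE2 kernel. */
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}

Because the kernels now use movdqu throughout, the same code path works whether or not src_ptr and dst_ptr are 16-byte aligned, which is why the *_Unaligned_* duplicates removed earlier in this diff are no longer needed.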