update libyuv to r1456

picks up build warning fixes for visual studio 2015

Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520
Author: James Zern
Date:   2015-07-24 16:54:51 -07:00
parent f42012e526
commit fcb4253c9c
46 changed files with 5400 additions and 2955 deletions


@@ -22,17 +22,18 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/planar_functions.cc \
third_party/libyuv/source/row_any.cc \
third_party/libyuv/source/row_common.cc \
third_party/libyuv/source/row_gcc.cc \
third_party/libyuv/source/row_mips.cc \
third_party/libyuv/source/row_neon.cc \
third_party/libyuv/source/row_neon64.cc \
third_party/libyuv/source/row_posix.cc \
third_party/libyuv/source/row_win.cc \
third_party/libyuv/source/scale.cc \
third_party/libyuv/source/scale_any.cc \
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_gcc.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
third_party/libyuv/source/scale_neon64.cc \
third_party/libyuv/source/scale_posix.cc \
third_party/libyuv/source/scale_win.cc \
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1305
Version: 1456
License: BSD
License File: LICENSE
@@ -13,4 +13,3 @@ which down-samples the original input video (e.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'
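As a sketch of that down-sampling step (buffer names are illustrative, not
libvpx's actual code), libyuv's I420Scale halves a 1280x720 I420 frame to
640x360 for a lower-resolution stream:

  I420Scale(src_y, 1280, src_u, 640, src_v, 640,
            1280, 720,
            dst_y, 640, dst_u, 320, dst_v, 320,
            640, 360,
            kFilterBox);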


@@ -71,6 +71,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,


@@ -68,20 +68,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I400 (grey) to ARGB.
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alias.
#define YToARGB I400ToARGB_Reference
// Convert I400 to ARGB. Reverse of ARGBToI400.
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Alias.
#define YToARGB I400ToARGB
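// A minimal sketch of the new J400 path (buffer names hypothetical):
// J400ToARGB replicates full-range grey into B, G and R with alpha set to
// 255, while I400ToARGB treats the input as limited-range Y.
//   uint8 grey[32 * 32];
//   uint8 argb[32 * 32 * 4];
//   J400ToARGB(grey, 32, argb, 32 * 4, 32, 32);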
// Convert NV12 to ARGB.
LIBYUV_API


@@ -137,6 +137,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_frame, int dst_stride_frame,
const uint8* dither4x4, int width, int height);
LIBYUV_API
int I420ToARGB1555(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,


@@ -61,12 +61,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
// Values in dither matrix from 0 to 255. 128 is best for no dither.
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
// const uint8(*dither)[4][4];
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height);
const uint8* dither4x4, int width, int height);
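// For illustration, a 4x4 ordered-dither (Bayer) matrix scaled to the
// recommended 0..7 range, first byte upper left (a sketch; any 16 bytes in
// that layout work):
//   static const uint8 kDither4x4[16] = {
//     0, 4, 1, 5,
//     6, 2, 7, 3,
//     1, 5, 0, 4,
//     7, 3, 6, 2,
//   };
//   ARGBToRGB565Dither(src_argb, src_stride_argb,
//                      dst_rgb565, dst_stride_rgb565,
//                      kDither4x4, width, height);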
// Convert ARGB To ARGB1555.
LIBYUV_API
@@ -140,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
int ARGBToG(const uint8* src_argb, int src_stride_argb,
uint8* dst_g, int dst_stride_g,
int width, int height);
// Convert ARGB To NV12.
LIBYUV_API
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,


@@ -45,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
#define J400ToJ400 I400ToI400
// Copy I422 to I422.
#define I422ToI422 I422Copy
@@ -84,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_v, int dst_stride_v,
int width, int height);
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height);
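// A minimal use of the new packed-to-NV12 path (hypothetical 640x480
// capture buffer; YUY2 is 2 bytes per pixel, NV12 interleaves the 2x2
// subsampled U and V into one plane):
//   YUY2ToNV12(yuy2, 640 * 2,
//              dst_y, 640,
//              dst_uv, 640,
//              640, 480);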
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -93,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
int width, int height);
// Alias
#define J420ToJ400 I420ToI400
#define I420ToI420Mirror I420Mirror
// I420 mirror.
@@ -387,24 +401,24 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
uint8* dst_argb, int dst_stride_argb,
int width, int height, int interpolation);
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBAFFINEROW_SSE2
#endif
// Row functions for copying a pixels from a source with a slope to a row
// Row function for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
#define HAS_ARGBAFFINEROW_SSE2
#endif // LIBYUV_DISABLE_X86
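// Example of one affine fetch (values illustrative, and it assumes uv_dudv
// packs {u, v, du/dx, dv/dx}): start at source coordinate (10, 20) and step
// diagonally across the source per destination pixel.
//   float uv_dudv[4] = {10.f, 20.f, 0.707f, 0.707f};
//   ARGBAffineRow_C(src_argb, src_stride_argb, dst_row, uv_dudv, 100);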
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.


@@ -0,0 +1,138 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
// The following are available for 32 bit GCC:
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
#define HAS_TRANSPOSEWX8_FAST_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_MIPS_DSPR2
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height);
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
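// Reference for the kernels above (a sketch with a hypothetical 16x8 block):
// transpose swaps rows and columns, and combined with a mirror it yields the
// 90- and 270-degree rotations.
//   uint8 src[8 * 16];                       // 8 rows of 16 pixels
//   uint8 dst[16 * 8];                       // 16 rows of 8 pixels
//   TransposeWxH_C(src, 16, dst, 8, 16, 8);  // width 16, height 8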
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT


@@ -37,10 +37,8 @@ extern "C" {
free(var##_mem); \
var = 0
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR) || \
(defined(__i386__) && !defined(__SSE2__)) || \
(defined(_MSC_VER) && defined(__clang__))
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// True if compiling for SSSE3 as a requirement.
@@ -48,6 +46,9 @@ extern "C" {
#define LIBYUV_SSSE3_ONLY
#endif
#if defined(__native_client__)
#define LIBYUV_DISABLE_NEON
#endif
// clang >= 3.5.0 required for Arm64.
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -63,11 +64,11 @@ extern "C" {
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBSETROW_X86
#define HAS_ARGBSHUFFLEROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERGGROW_SSE2
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
@@ -95,7 +96,8 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
// #define HAS_J422TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3
@@ -112,15 +114,13 @@ extern "C" {
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
#define HAS_SETROW_ERMS
#define HAS_ARGBSETROW_X86
#define HAS_SETROW_X86
#define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
#define HAS_UYVYTOYROW_SSE2
#define HAS_YTOARGBROW_SSE2
#define HAS_YUY2TOARGBROW_SSSE3
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
@@ -157,8 +157,9 @@ extern "C" {
#define HAS_SOBELYROW_SSE2
#endif
// The following are available on x64 Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
// The following are available on x64 Visual C and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3
#endif
@@ -177,27 +178,31 @@ extern "C" {
#endif // __clang__
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available require VS2012. Port to GCC.
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOBGRAROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_ARGBTORGB565ROW_AVX2
#define HAS_ARGB1555TOARGBROW_AVX2
#define HAS_ARGB4444TOARGBROW_AVX2
#define HAS_ARGBTOARGB1555ROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_AVX2
#define HAS_I411TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_NV21TORGB565ROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -214,24 +219,27 @@ extern "C" {
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_I400TOARGBROW_AVX2
#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOBGRAROW_AVX2
#define HAS_I422TORAWROW_AVX2
#define HAS_I422TORGB24ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YTOARGBROW_AVX2
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
// The following require HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2)
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#endif
// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
@@ -240,22 +248,6 @@ extern "C" {
#define HAS_ARGBUNATTENUATEROW_AVX2
#endif
// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
(defined(_M_IX86) || defined(_M_X64) || \
defined(__x86_64__) || defined(__i386__))
#define HAS_MERGEUVROW_AVX2
#define HAS_MERGEUVROW_MMX
#define HAS_SPLITUVROW_AVX2
#define HAS_SPLITUVROW_MMX
#define HAS_UYVYTOYROW_AVX2
#define HAS_UYVYTOYROW_MMX
#define HAS_YUY2TOYROW_AVX2
#define HAS_YUY2TOYROW_MMX
#endif
// The following are disabled when SSSE3 is available:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
@@ -278,7 +270,6 @@ extern "C" {
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTOBAYERGGROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
@@ -292,7 +283,7 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
@@ -331,11 +322,12 @@ extern "C" {
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YTOARGBROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
// Effects:
#define HAS_ARGBADDROW_NEON
@@ -388,7 +380,6 @@ typedef __declspec(align(32)) int8 lvec8[32];
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
#elif defined(__GNUC__)
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
@@ -869,6 +860,11 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
@@ -884,12 +880,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
int pix);
void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
int pix);
void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
int pix);
void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
int pix);
void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
@@ -905,6 +909,13 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -914,6 +925,8 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width);
void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -922,14 +935,13 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I444ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
@@ -1038,6 +1050,11 @@ void I444ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void I444ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1048,6 +1065,11 @@ void I411ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void I411ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void NV12ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1097,6 +1119,11 @@ void J422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void J422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToBGRARow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1147,11 +1174,21 @@ void I422ToRGB24Row_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgb24,
int width);
void I422ToRGB24Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
int width);
void I422ToRAWRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width);
void I422ToRAWRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1177,6 +1214,11 @@ void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void I444ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1187,6 +1229,11 @@ void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void I411ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1231,6 +1278,16 @@ void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
uint8* dst_argb,
int width);
void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void J422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1281,33 +1338,29 @@ void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void I422ToRAWRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
void YToARGBRow_C(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_AVX2(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
@@ -1375,6 +1428,11 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix);
void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1384,6 +1442,8 @@ void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width);
void I444ToARGBRow_Any_NEON(const uint8* src_y,
const uint8* src_u,
@@ -1570,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1770,6 +1819,18 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width);
void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width);
void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void ARGBPolynomialRow_C(const uint8* src_argb,
uint8* dst_argb, const float* poly,


@@ -12,45 +12,66 @@
#define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "libyuv/basic_types.h"
#include "libyuv/scale.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR)
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN4_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEADDROWS_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALECOLSUP2_SSE2
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_FIXEDDIV_X86
#define HAS_FIXEDDIV1_X86
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSE2
#endif
// The following are available on VS2012:
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif
// The following are available on Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#endif
// The following are available on Mips platforms:
@@ -164,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int dst_width);
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height);
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height);
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
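// The new single-row accumulator in use (a sketch; names illustrative, and
// it assumes ScaleAddRow_C adds each source row into the uint16 sums, which
// the caller then divides by the row count to finish a box filter):
//   uint16 sums[64] = {0};
//   for (int r = 0; r < 3; ++r) {
//     ScaleAddRow_C(src_ptr + r * src_stride, sums, 64);
//   }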
void ScaleARGBRowDown2_C(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
@@ -194,16 +213,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Specialized scalers for x86.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -220,46 +251,124 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height);
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
// ARGB Column functions
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Row functions.
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.
@@ -267,7 +376,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
@@ -302,6 +412,42 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32 -> 12
void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx);
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1305
#define LIBYUV_VERSION 1456
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT


@@ -37,7 +37,7 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
#define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700
#ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif
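// For reference, the djb2 recurrence these kernels vectorize: fold
// hash = hash * 33 + byte over count bytes, starting from the caller's seed
// (a sketch; the function name here is hypothetical).
//   uint32 HashDjb2_Reference(const uint8* src, int count, uint32 seed) {
//     uint32 hash = seed;
//     for (int i = 0; i < count; ++i) hash = hash * 33 + src[i];
//     return hash;
//   }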
@@ -138,8 +138,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
#ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif


@@ -32,7 +32,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"


@@ -16,9 +16,11 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
@@ -59,7 +61,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
@@ -133,7 +135,7 @@ static uvec32 kHashMul3 = {
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
@@ -184,7 +186,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16))
__declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm {
mov eax, [esp + 4] // src
@@ -219,8 +221,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
}
}
#endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"


@@ -817,22 +817,20 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
// Neon version does direct RGB24 to YUV.
#if defined(HAS_RGB24TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
}
#endif
#if defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -841,27 +839,29 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{
#if !defined(HAS_RGB24TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
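// Worked example of the wider rounding (annotation, not library code): for
// width = 100, 100 * 4 + 31 = 431 and 431 & ~31 = 416, a 32-byte multiple,
// so the AVX2 row kernels can store full 32-byte vectors into the scratch
// rows without overrunning them.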
@@ -894,8 +894,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#if !defined(HAS_RGB24TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
}
#endif
return 0;
}
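// The selection pattern used throughout these conversions, in miniature
// (a sketch, not the library's exact code): start from the C fallback, take
// the _Any_ variant when the CPU flag is present, and promote to the
// full-SIMD kernel only when the width matches its step.
//   void (*ARGBToYRow)(const uint8* src, uint8* dst, int pix) = ARGBToYRow_C;
//   if (TestCpuFlag(kCpuHasSSSE3)) {
//     ARGBToYRow = ARGBToYRow_Any_SSSE3;  // any width; C tail inside
//     if (IS_ALIGNED(width, 16)) {
//       ARGBToYRow = ARGBToYRow_SSSE3;    // whole row, 16 pixels per step
//     }
//   }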
@@ -931,22 +931,20 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
src_stride_raw = -src_stride_raw;
}
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
}
#endif
#if defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -955,59 +953,63 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYROW_NEON)
#if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else
#else
RAWToARGBRow(src_raw, row, width);
RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif
#endif
src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
if (height & 1) {
#if defined(HAS_RAWTOYROW_NEON)
#if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
#else
RAWToARGBRow(src_raw, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width);
#endif
#endif
}
#if !defined(HAS_RAWTOYROW_NEON)
#if !defined(HAS_RAWTOYROW_NEON)
free_aligned_buffer_64(row);
#endif
}
#endif
return 0;
}
@@ -1043,19 +1045,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = -src_stride_rgb565;
}
// Neon version does direct RGB565 to YUV.
#if defined(HAS_RGB565TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON;
}
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB565ToUVRow = RGB565ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) {
RGB565ToUVRow = RGB565ToUVRow_NEON;
}
}
}
#else // HAS_RGB565TOYROW_NEON
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
@@ -1064,28 +1067,37 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
#if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB565TOYROW_NEON
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{
#if !defined(HAS_RGB565TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
@@ -1118,8 +1130,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#if !defined(HAS_RGB565TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
}
#endif
return 0;
}
@@ -1155,19 +1167,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = -src_stride_argb1555;
}
// Neon version does direct ARGB1555 to YUV.
#if defined(HAS_ARGB1555TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON;
}
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
}
}
}
#else // HAS_ARGB1555TOYROW_NEON
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
@@ -1176,30 +1189,40 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB1555TOYROW_NEON
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{
#if !defined(HAS_ARGB1555TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON)
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
@@ -1230,9 +1253,9 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#endif
}
#if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
free_aligned_buffer_64(row);
}
#endif
return 0;
}
@@ -1268,19 +1291,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = -src_stride_argb4444;
}
// Neon version does direct ARGB4444 to YUV.
#if defined(HAS_ARGB4444TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON;
}
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
}
}
}
#else // HAS_ARGB4444TOYROW_NEON
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
@@ -1289,28 +1313,37 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_ARGB4444TOYROW_NEON
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{
#if !defined(HAS_ARGB4444TOYROW_NEON)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15;
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
@ -1345,8 +1378,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#if !defined(HAS_ARGB4444TOYROW_NEON)
free_aligned_buffer_64(row);
#endif
}
#endif
return 0;
}
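// Note: the two-row ARGB buffers above are rounded up to a multiple of
// 32 bytes so the widest (32-byte AVX2) row functions can process a full
// final register without writing past the allocation; align_buffer_64
// also aligns the base pointer. The round-up is the usual mask trick,
// e.g. for width = 100:
//   (width * 4 + 31) & ~31  =  431 & ~31  =  416  (13 * 32 bytes).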


@ -85,6 +85,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I444ToARGBRow = I444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I444ToARGBRow = I444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON;
@ -222,6 +230,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I411TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I411ToARGBRow = I411ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I411ToARGBRow = I411ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I411TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON;
@ -243,13 +259,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
// Convert I400 to ARGB.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*YToARGBRow)(const uint8* y_buf,
void (*I400ToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width) = YToARGBRow_C;
int width) = I400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@ -267,47 +283,47 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_YTOARGBROW_SSE2)
#if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2;
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
#endif
#if defined(HAS_YTOARGBROW_AVX2)
#if defined(HAS_I400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2;
I400ToARGBRow = I400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2;
I400ToARGBRow = I400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_YTOARGBROW_NEON)
#if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON;
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
I400ToARGBRow = I400ToARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
YToARGBRow(src_y, dst_argb, width);
I400ToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
return 0;
}
// Convert I400 to ARGB.
// Convert J400 to ARGB.
LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y,
int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
int y;
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C;
void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
J400ToARGBRow_C;
if (!src_y || !dst_argb ||
width <= 0 || height == 0) {
return -1;
@ -325,24 +341,32 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_I400TOARGBROW_SSE2)
#if defined(HAS_J400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
J400ToARGBRow = J400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_SSE2;
J400ToARGBRow = J400ToARGBRow_SSE2;
}
}
#endif
#if defined(HAS_I400TOARGBROW_NEON)
#if defined(HAS_J400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
J400ToARGBRow = J400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
J400ToARGBRow = J400ToARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y;
dst_argb += dst_stride_argb;
}
@ -552,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
@ -602,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
@ -652,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;


@ -739,6 +739,14 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I422TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_AVX2;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@ -791,6 +799,14 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_I422TORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
@ -993,6 +1009,117 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
return 0;
}
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5,
6, 2, 7, 3,
1, 5, 0, 4,
7, 3, 6, 2,
};
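// Row selection below is (y & 3) << 2, i.e. one 4-byte line of this
// matrix per scanline, and the row functions apply one byte per pixel
// (x & 3) before truncating to 565. Roughly, per pixel, the C fallback
// does the following (a sketch; clamp255 saturates to [0, 255]):
//   int dither = ((const uint8*)(&dither4))[x & 3];
//   uint8 b = clamp255(src_argb[0] + dither) >> 3;
//   uint8 g = clamp255(src_argb[1] + dither) >> 2;
//   uint8 r = clamp255(src_argb[2] + dither) >> 3;
//   *(uint16*)(dst_rgb) = b | (g << 5) | (r << 11);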
// Convert I420 to RGB565 with dithering.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
if (!dither4x4) {
dither4x4 = kDither565_4x4;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
{
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row_argb);
}
return 0;
}
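// Illustrative call: passing NULL for dither4x4 selects the built-in
// kDither565_4x4 above. RGB565 packs 2 bytes per pixel, hence the
// width * 2 destination stride:
//   I420ToRGB565Dither(y, y_stride, u, u_stride, v, v_stride,
//                      dst, width * 2, NULL, width, height);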
// Convert I420 to specified format
LIBYUV_API
int ConvertFromI420(const uint8* y, int y_stride,


@ -72,7 +72,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -139,7 +146,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@ -148,6 +154,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -275,6 +289,16 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -317,8 +341,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
#endif
{
// Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@ -374,6 +398,16 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -416,8 +450,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
#endif
{
// Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15);
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@ -492,6 +526,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -591,6 +633,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -804,25 +854,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0;
}
static const uint8 kDither8x8[64] = {
0, 128, 32, 160, 8, 136, 40, 168,
192, 64, 224, 96, 200, 72, 232, 104,
48, 176, 16, 144, 56, 184, 24, 152,
240, 112, 208, 80, 248, 120, 216, 88,
12, 140, 44, 172, 4, 132, 36, 164,
204, 76, 236, 108, 196, 68, 228, 100,
60, 188, 28, 156, 52, 180, 20, 148,
252, 124, 220, 92, 244, 116, 212, 84,
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5,
6, 2, 7, 3,
1, 5, 0, 4,
7, 3, 6, 2,
};
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height) {
const uint8* dither4x4, int width, int height) {
int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@ -831,13 +878,36 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
if (!dither8x8) {
dither8x8 = kDither8x8;
if (!dither4x4) {
dither4x4 = kDither565_4x4;
}
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
dither8x8 + ((y & 7) << 3), width);
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565;
}
@ -845,6 +915,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
}
// Convert ARGB To RGB565.
// TODO(fbarchard): Consider using dither function low level with zeros.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
@ -1021,7 +1092,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
int width, int height) {
int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
if (!src_argb ||
@ -1045,7 +1116,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@ -1140,6 +1211,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;


@ -10,13 +10,12 @@
#include "libyuv/cpu_id.h"
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
(defined(_M_IX86) || defined(_M_X64))
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
@ -37,23 +36,23 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid the additional check.
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#endif
#if defined(_M_IX86)
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
@ -71,7 +70,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
}
#endif
#else // defined(_MSC_VER)
// GCC version uses inline x86 assembly.
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx;
asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__)
@ -89,37 +89,38 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx;
cpu_info[3] = info_edx;
#endif // defined(_MSC_VER)
#endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
}
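// Illustrative use: leaf 0 returns the CPU vendor string split across
// the ebx/edx/ecx slots (cpu_info[1], cpu_info[3], cpu_info[2]):
//   uint32 cpu_info[4];
//   CpuId(0, 0, cpu_info);  // On Intel parts: "Genu" "ineI" "ntel".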
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of
// the ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#endif
#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#endif
#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
#endif
// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect whether the OS saves the high parts of
// the ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
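// TestOsSaveYmm() exists because AVX is a three-way handshake: the CPU
// must implement the instructions and the OS must context-switch the
// full ymm state (XCR0 bits 1 and 2, hence the (xcr0 & 6) == 6 test).
// A sketch of how the pieces combine for an AVX2 check (bit positions
// per the Intel SDM; see InitCpuFlags for the real gating):
//   uint32 info1[4], info7[4];
//   CpuId(1, 0, info1);  // info1[2] (ecx) bit 28: AVX instructions.
//   CpuId(7, 0, info7);  // info7[1] (ebx) bit 5:  AVX2.
//   int has_avx2 = (info1[2] & 0x10000000) && TestOsSaveYmm() &&
//                  (info7[1] & 0x00000020);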
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS


@ -18,6 +18,12 @@
// Must be included before jpeglib.
#include <setjmp.h>
#define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif
struct FILE; // For jpeglib.h.


@ -23,7 +23,7 @@ extern "C" {
#ifdef ENABLE_SCASB
// Multiple of 1.
__declspec(naked) __declspec(align(16))
__declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm {
mov edx, edi


@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
return 0;
}
// Get a blender optimized for the CPU, alignment and pixel count.
// Get a blender optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible.
LIBYUV_API
@ -677,12 +677,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2;
}
#endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
const uint8* src_sobely,
uint8* dst, int width)) {
int y;
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) = ARGBToBayerGGRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
ARGBToYJRow_C;
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C;
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
// ARGBToBayer is used to select the G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON;
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelYRow = SobelYRow_SSE2;
@ -2040,7 +2041,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
#endif
{
// 3 rows with edges before/after.
const int kRowSize = (width + kEdge + 15) & ~15;
const int kRowSize = (width + kEdge + 31) & ~31;
align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
uint8* row_sobelx = rows;
uint8* row_sobely = rows + kRowSize;
@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
uint8* row_y0 = row_y + kEdge;
uint8* row_y1 = row_y0 + kRowSize;
uint8* row_y2 = row_y1 + kRowSize;
ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
ARGBToYJRow(src_argb, row_y1, width);
row_y1[-1] = row_y1[0];
memset(row_y1 + width, row_y1[width - 1], 16);
memset(row_y2 + width, 0, 16);
for (y = 0; y < height; ++y) {
// Convert next row of ARGB to Y.
// Convert next row of ARGB to G.
if (y < (height - 1)) {
src_argb += src_stride_argb;
}
ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
ARGBToYJRow(src_argb, row_y2, width);
row_y2[-1] = row_y2[0];
row_y2[width] = row_y2[width - 1];
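// The three buffered luma rows feed Sobel-style gradient kernels.
// Roughly, per pixel (a sketch of the C fallbacks; clamp255 saturates):
//   sobelx[i] = clamp255(abs(y0[i] + 2 * y1[i] + y2[i]
//                          - y0[i + 2] - 2 * y1[i + 2] - y2[i + 2]));
//   sobely[i] = clamp255(abs(y0[i] + 2 * y0[i + 1] + y0[i + 2]
//                          - y1[i] - 2 * y1[i + 1] - y1[i + 2]));
// SobelRow then combines them, e.g. clamp255(sobelx + sobely)
// replicated into the ARGB channels.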
@ -2094,13 +2095,19 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
SobelRow = SobelRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
}
#endif
#if defined(HAS_SOBELROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
SobelRow = SobelRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
SobelRow = SobelRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelRow = SobelRow_NEON;
}
}
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@ -2115,13 +2122,19 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
}
#endif
#if defined(HAS_SOBELTOPLANEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_NEON;
}
}
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
@ -2137,13 +2150,19 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXYRow = SobelXYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
}
#endif
#if defined(HAS_SOBELXYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
SobelXYRow = SobelXYRow_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
SobelXYRow = SobelXYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelXYRow = SobelXYRow_NEON;
}
}
#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@ -2322,6 +2341,214 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
return 0;
}
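// The two functions below convert packed 4:2:2 formats straight to
// NV12: a full-resolution Y plane plus a half-height plane of
// interleaved U,V byte pairs, with the chroma averaged vertically.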
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_yuy2 ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, rows, awidth);
SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
rows + awidth, awidth);
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
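// Why this works: SplitUVRow deinterleaves byte pairs, so for YUY2
// (Y0 U0 Y1 V0 ...) the even bytes land in dst_y and the odd bytes land
// in rows as a packed UV line (U0 V0 U1 V1 ...). The two UV lines of a
// 2-row block sit back to back in the buffer, so InterpolateRow is
// called with src_stride == awidth and fraction 128, a 50/50 vertical
// blend. A scalar sketch of that blend (rounding may differ by one LSB):
//   for (i = 0; i < awidth; ++i) {
//     dst_uv[i] = (rows[i] + rows[awidth + i] + 1) >> 1;
//   }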
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_uyvy ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
SplitUVRow(src_uyvy, rows, dst_y, awidth);
SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
dst_y + dst_stride_y, awidth);
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_uyvy += src_stride_uyvy * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_uyvy, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
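// UYVYToNV12 above is the same algorithm as YUY2ToNV12 with the
// SplitUVRow outputs swapped, because the packed byte orders differ
// (4 bytes cover 2 pixels):
//   YUY2: Y0 U0 Y1 V0  - luma on even bytes, chroma on odd bytes.
//   UYVY: U0 Y0 V0 Y1  - chroma on even bytes, luma on odd bytes.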
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -13,6 +13,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@ -20,809 +21,39 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
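// For instance, on ELF targets DECLARE_FUNCTION(TransposeUVWx8_SSE2)
// emits the assembler prologue:
//   .text
//   .align 4,0x90
//   TransposeUVWx8_SSE2:
// while Mach-O i386 adds .private_extern and MinGW/Cygwin prefix the
// symbol with an underscore, matching each platform's C name mangling.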
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif // defined(__mips__)
#if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(w) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
uint8* dst, int dst_stride, int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
}
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
#if defined(HAS_TRANSPOSEWX8_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
}
}
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_Fast_SSSE3;
}
}
#endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
} else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
@ -837,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride,
i -= 8;
}
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
}
LIBYUV_API
@ -955,48 +188,6 @@ void RotatePlane180(const uint8* src, int src_stride,
free_aligned_buffer_64(row);
}
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
@ -1007,17 +198,17 @@ void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
#if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
#endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#endif
#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@ -1036,10 +227,12 @@ void TransposeUV(const uint8* src, int src_stride,
i -= 8;
}
TransposeUVWxH_C(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, i);
if (i > 0) {
TransposeUVWxH_C(src, src_stride,
dst_a, dst_stride_a,
dst_b, dst_stride_b,
width, i);
}
}
LIBYUV_API
third_party/libyuv/source/rotate_any.cc (new file)
@ -0,0 +1,55 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif
#undef TANY
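For reference, TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) expands to the wrapper below: the SIMD kernel consumes the largest multiple of 8 columns, and the C kernel finishes the remaining width & 7 columns, offset n bytes into each source row and n rows into the destination (when r is 0 the C loop simply does nothing):

void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride, int width) {
  int r = width & 7;   /* leftover columns handled in C */
  int n = width - r;   /* multiple-of-8 columns handled by NEON */
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);
  }
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}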
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
@ -27,24 +27,20 @@ extern "C" {
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx,
uint8* dst_ptr, int dst_width);
int src_stepx, uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx,
uint8* dst_ptr, int dst_width);
int src_stepx, uint8* dst_ptr, int dst_width);
#endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx,
uint8* dst_ptr, int dst_width);
int src_stepx, uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
uint8* dst, int dst_stride, int width, int height) {
int i;
int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
@ -68,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
}
void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
uint8* dst, int dst_stride, int width, int height) {
// Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
@ -79,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
}
void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
uint8* dst, int dst_stride, int width, int height) {
// Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
@ -90,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
}
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
uint8* dst, int dst_stride, int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1);
@ -166,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height,
uint8* dst_argb, int dst_stride_argb, int width, int height,
enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1;
@ -0,0 +1,92 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
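A quick way to read the UV variants above: the source rows are interleaved UVUV pairs, and the transpose de-interleaves as it goes, so dst_a receives the transposed U plane and dst_b the transposed V plane. A small hypothetical check of that contract (not libyuv code), using the same loops as TransposeUVWxH_C on a block of 2x2 pairs:

#include <assert.h>

typedef unsigned char uint8;

int main(void) {
  /* Row j, pair i holds U = 10*j + i and V = 100 + 10*j + i. */
  uint8 src[2][4] = {{ 0, 100,  1, 101},
                     {10, 110, 11, 111}};
  uint8 u[2][2];
  uint8 v[2][2];
  for (int i = 0; i < 2 * 2; i += 2) {     /* width * 2 interleaved bytes */
    for (int j = 0; j < 2; ++j) {          /* height */
      u[i >> 1][j] = src[j][i];
      v[i >> 1][j] = src[j][i + 1];
    }
  }
  assert(u[1][0] == 1 && u[1][1] == 11);   /* U column 1 became U row 1 */
  assert(v[0][1] == 110);                  /* V column 0, source row 1 */
  return 0;
}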
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
third_party/libyuv/source/rotate_gcc.cc (new file)
@ -0,0 +1,493 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -22,8 +23,7 @@ extern "C" {
(_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@ -106,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
);
}
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
@ -21,11 +22,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
uint8* dst, int dst_stride, int width) {
const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile (
third_party/libyuv/source/rotate_win.cc (new file)
@ -0,0 +1,248 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
File diff suppressed because it is too large.
@ -199,28 +199,36 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
// dither4 is a row of 4 values from a 4x4 dither matrix.
// The 4x4 matrix contains values to increase RGB. When converting to
// fewer bits (565) this provides an ordered dither.
// The first byte of the matrix corresponds to the upper-left pixel.
// The 4 values are passed as an int, then referenced as an array, so
// endianness will not affect the order of the original matrix. But dither4
// will contain the first pixel in the lower byte for little endian,
// or in the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int width) {
const uint32 dither4, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = dither8x8[x & 7] - 128;
int dither1 = dither8x8[(x & 7) + 1] - 128;
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
int dither0 = dither8x8[(width - 1) & 7] - 128;
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
*(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
}
}
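Since dither4 arrives as a packed uint32 indexed one byte at a time with x & 3, a caller builds it from one 4-entry row of the dither matrix: byte 0 dithers pixels where (x & 3) == 0, byte 1 where (x & 3) == 1, and so on. A hedged sketch of constructing the argument (the helper name is illustrative, not a libyuv API):

#include <stdint.h>
#include <string.h>

/* Pack one row of a 4x4 ordered-dither matrix into the uint32 that
   ARGBToRGB565DitherRow_C expects. memcpy reproduces exactly the byte
   order the row function reads back, on either endianness. */
static uint32_t pack_dither4(const uint8_t row[4]) {
  uint32_t d;
  memcpy(&d, row, 4);
  return d;
}

A scaler would pass four such words, one per matrix row, cycling through them as it steps down the image.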
@ -974,7 +982,7 @@ void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
}
}
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
int x;
for (x = 0; x < width; ++x) {
@ -986,38 +994,42 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
}
}
// YUV to RGB conversion constants.
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
*g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
*r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
*b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
*g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
*r = Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
}
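As a sanity check on these fixed-point constants, a standalone replica of YuvPixel maps limited-range black (16, 128, 128) to 0 and white (235, 128, 128) to 255. This sketch is illustration only; the constants are copied from the defines above:

#include <assert.h>
#include <stdint.h>

static uint8_t clamp6(int32_t v) {  /* Clamp(v >> 6) from the code above */
  v >>= 6;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

int main(void) {
  const int32_t YG = 18997, YGB = -1160;
  const int32_t UB = -128, UG = 25, VG = 52, VR = -102;
  const int32_t BB = UB * 128 + YGB;             /* -17544 */
  const int32_t BG = UG * 128 + VG * 128 + YGB;  /*   8696 */
  const int32_t BR = VR * 128 + YGB;             /* -14216 */
  for (int y = 16; y <= 235; y += 219) {         /* y = 16, then y = 235 */
    int32_t y1 = (int32_t)(((uint32_t)(y * 0x0101) * YG) >> 16);
    uint8_t b = clamp6(-(128 * UB) + y1 + BB);
    uint8_t g = clamp6(-(128 * VG + 128 * UG) + y1 + BG);
    uint8_t r = clamp6(-(128 * VR) + y1 + BR);
    assert(b == g && g == r);                    /* grey stays grey */
    assert(r == (y == 16 ? 0 : 255));            /* exact endpoints */
  }
  return 0;
}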
// C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(y1 - YGB) >> 6);
*g = Clamp((int32)(y1 - YGB) >> 6);
*r = Clamp((int32)(y1 - YGB) >> 6);
*b = Clamp((int32)(y1 + YGB) >> 6);
*g = Clamp((int32)(y1 + YGB) >> 6);
*r = Clamp((int32)(y1 + YGB) >> 6);
}
#undef YG
@ -1030,6 +1042,46 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
#undef BG
#undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// C reference code that mimics the YUV assembly.
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
*b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
*g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
*r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
}
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
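The same endpoint check works for the full-range JPEG constants: for Y = 255, U = V = 128, y1 = (255 * 0x0101 * 16320) >> 16 = 16319, and B = (-(128 * -113) + 16319 + (-14432)) >> 6 = 16351 >> 6 = 255; for Y = 0 the same sum is 14464 + 0 - 14432 = 32, and 32 >> 6 = 0. The half-unit YGBJ = 32 exists purely for rounding, since JPEG Y carries no 16 offset.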
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
@ -1102,34 +1154,6 @@ void I422ToARGBRow_C(const uint8* src_y,
}
}
// C reference code that mimics the YUV assembly.
// * R = Y + 1.40200 * Cr
// * G = Y - 0.34414 * Cb - 0.71414 * Cr
// * B = Y + 1.77200 * Cb
#define YGJ 64 /* (int8)round(1.000 * 64) */
#define UBJ 113 /* (int8)round(1.772 * 64) */
#define UGJ -22 /* (int8)round(-0.34414 * 64) */
#define URJ 0
#define VBJ 0
#define VGJ -46 /* (int8)round(-0.71414 * 64) */
#define VRJ 90 /* (int8)round(1.402 * 64) */
// Bias
#define BBJ (UBJ * 128 + VBJ * 128)
#define BGJ (UGJ * 128 + VGJ * 128)
#define BRJ (URJ * 128 + VRJ * 128)
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * YGJ);
*b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
*g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
*r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
}
void J422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@ -1354,23 +1378,23 @@ void I411ToARGBRow_C(const uint8* src_y,
}
void NV12ToARGBRow_C(const uint8* src_y,
const uint8* usrc_v,
const uint8* src_uv,
uint8* rgb_buf,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
YuvPixel(src_y[1], src_uv[0], src_uv[1],
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
usrc_v += 2;
src_uv += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
@ -1402,7 +1426,7 @@ void NV21ToARGBRow_C(const uint8* src_y,
}
void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* usrc_v,
const uint8* src_uv,
uint8* dst_rgb565,
int width) {
uint8 b0;
@ -1413,8 +1437,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
uint8 r1;
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@ -1424,11 +1448,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
*(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2;
usrc_v += 2;
src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
@ -1588,7 +1612,7 @@ void I422ToRGBARow_C(const uint8* src_y,
}
}
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@ -2062,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
// Copy a row of G.
int x;
for (x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[1];
dst_bayer[1] = src_argb[5];
src_argb += 8;
dst_bayer += 2;
}
if (pix & 1) {
dst_bayer[0] = src_argb[1];
}
}
// Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
@ -2120,7 +2128,7 @@ void I422ToYUY2Row_C(const uint8* src_y,
if (width & 1) {
dst_frame[0] = src_y[0];
dst_frame[1] = src_u[0];
dst_frame[2] = src_y[0]; // duplicate last y
dst_frame[2] = 0;
dst_frame[3] = src_v[0];
}
}
@ -2144,14 +2152,15 @@ void I422ToUYVYRow_C(const uint8* src_y,
dst_frame[0] = src_u[0];
dst_frame[1] = src_y[0];
dst_frame[2] = src_v[0];
dst_frame[3] = src_y[0]; // duplicate last y
dst_frame[3] = 0;
}
}
// Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048
#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3)
#if !(defined(_MSC_VER) && !defined(__clang__)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
@ -2346,6 +2355,50 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
}
#endif
#if defined(HAS_I422TORGB24ROW_AVX2)
void I422ToRGB24Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
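The wrapper loop's chunking is easy to trace by hand. A trivial standalone illustration (hypothetical width, not libyuv code):

#include <stdio.h>

#define MAXTWIDTH 2048  /* same cap as the wrappers above */

int main(void) {
  int width = 5000;  /* hypothetical row width */
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    printf("convert %d pixels via the ARGB temp row\n", twidth);
    width -= twidth;   /* processes 2048, 2048, then 904 pixels */
  }
  return 0;
}

Each chunk converts into the stack-resident ARGB row first, then repacks to the destination format, which is why the temporary buffer is MAXTWIDTH * 4 bytes.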
#if defined(HAS_I422TORAWROW_AVX2)
void I422ToRAWRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRAWRow_AVX2
ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_raw += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
uint8* dst_rgb565, int width) {
@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
}
#endif // TESTING
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_I400TOARGBROW_SSE2
#endif // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
struct YuvConstants {
lvec8 kUVToB; // 0
lvec8 kUVToG; // 32
@ -1440,6 +1423,27 @@ struct YuvConstants {
lvec16 kYToRgb; // 192
};
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// JPEG constants for YUV to RGB.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
{ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
{ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
{ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
{ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
{ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
{ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
{ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
// Read 8 UV from 444
#define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm2,%%xmm0 \n" \
"punpckhwd %%xmm2,%%xmm1 \n" \
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
"lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
// Store 8 BGRA values. Assumes XMM5 is zero.
#define STOREBGRA \
@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
"lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
// Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR \
@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm0,%%xmm2 \n" \
"punpckhwd %%xmm0,%%xmm1 \n" \
"movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
"lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
// Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \
@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
"lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
);
}
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
"vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
"vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
"vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \
@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
}
#endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_J422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants)
// Step 3: Weave into ARGB
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_J422TOARGBROW_AVX2
#if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
}
#endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n"
@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_YTOARGBROW_SSE2
#endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
"mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
"vmovd %%eax,%%xmm2 \n"
@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_YTOARGBROW_AVX2
#endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"sub $0x4,%3 \n"
"jl 49f \n"
// 4 pixel loop.
@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"sub $0x4,%3 \n"
"jl 49f \n"
// 4 pixel loop.
@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
@ -94,11 +94,17 @@ extern "C" {
"vtrn.u32 d2, d3 \n"
#define YUV422TORGB_SETUP_REG \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
"vld1.8 {d25}, [%[kUVToG]] \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
#define YUV422TORGB \
@ -186,7 +192,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -216,7 +222,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -246,7 +252,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -277,7 +283,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -308,7 +314,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -338,7 +344,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -367,7 +373,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -397,7 +403,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -439,7 +445,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -485,7 +491,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -526,14 +532,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
YUV422TORGB_SETUP_REG
".p2align 2 \n"
@ -552,17 +558,17 @@ void YToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void I400ToARGBRow_NEON(const uint8* src_y,
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n"
@ -603,7 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -631,7 +637,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -659,7 +665,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -687,7 +693,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -713,7 +719,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -739,7 +745,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
[kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
: "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
@ -1245,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
);
}
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
@ -1360,6 +1347,30 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
);
}
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
".p2align 2 \n"
"vdup.32 d2, %2 \n" // dither4
"1: \n"
MEMACCESS(1)
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n"
ARGBTORGB565
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
);
}
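
For reference, a scalar sketch of the per-pixel math the new dither row performs (illustrative helper names, not libyuv API; the 32-bit dither4 word is assumed to pack one dither byte per pixel column, repeating every four pixels, and vqadd.u8 is a saturating byte add):

static unsigned char SatAddU8(unsigned v) { return (unsigned char)(v > 255 ? 255 : v); }

static unsigned short ARGBToRGB565Dither1(const unsigned char* argb,  /* B,G,R,A */
                                          unsigned dither4, int x) {
  unsigned char d = (unsigned char)(dither4 >> ((x & 3) * 8));  // per-column byte
  unsigned b = SatAddU8(argb[0] + d);  // vqadd.u8 d20, d20, d2
  unsigned g = SatAddU8(argb[1] + d);  // vqadd.u8 d21, d21, d2
  unsigned r = SatAddU8(argb[2] + d);  // vqadd.u8 d22, d22, d2
  return (unsigned short)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}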
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) {
asm volatile (

View File

@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"1: \n"
READYUV444
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"1: \n"
READYUV411
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n"
@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n"
@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB1555
MEMACCESS(3)
@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"1: \n"
READYUV422
YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB4444
MEMACCESS(3)
@ -513,33 +513,34 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
}
#endif // HAS_I422TOARGB4444ROW_NEON
#ifdef HAS_YTOARGBROW_NEON
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
#ifdef HAS_I400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
int64 width64 = (int64)(width);
asm volatile (
YUV422TORGB_SETUP_REG
"1: \n"
READYUV400
YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
"subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
"+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_YTOARGBROW_NEON
#endif // HAS_I400TOARGBROW_NEON
#ifdef HAS_I400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y,
#ifdef HAS_J400TOARGBROW_NEON
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
@ -549,7 +550,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
"ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n"
"subs %w2, %w2, #8 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
@ -560,7 +561,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
: "cc", "memory", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_I400TOARGBROW_NEON
#endif // HAS_J400TOARGBROW_NEON
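
J400ToARGBRow (renamed from I400ToARGBRow here) simply replicates full-range grey into the three color channels with opaque alpha; a scalar equivalent, assuming libyuv's little-endian B,G,R,A byte order:

static void J400ToARGBRow_Sketch(const unsigned char* src_y,
                                 unsigned char* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    unsigned char y = src_y[x];
    dst_argb[0] = y;    // B
    dst_argb[1] = y;    // G
    dst_argb[2] = y;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}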
#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y,
@ -572,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"1: \n"
READNV12
YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
"subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@ -599,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"1: \n"
READNV21
YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
"subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@ -626,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"1: \n"
READNV12
YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
"subs %w3, %w3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@ -653,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"1: \n"
READNV21
YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
"subs %w3, %w3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@ -674,19 +675,20 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
int64 width64 = (int64)(width);
asm volatile (
YUV422TORGB_SETUP_REG
"1: \n"
READYUY2
YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
"subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
"+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@ -699,19 +701,20 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
int64 width64 = (int64)(width);
asm volatile (
YUV422TORGB_SETUP_REG
"1: \n"
READUYVY
YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
"subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
"+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@ -728,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
"subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
@ -754,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
"subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(2)
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
@ -776,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
"subs %w2, %w2, #32 \n" // 32 processed per loop
MEMACCESS(1)
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n"
@ -794,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
@ -809,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #4 \n" // 4 ints per loop
"subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n"
@ -822,6 +825,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
"add %0, %0, %2 \n"
@ -830,7 +834,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, %2, #16 \n" // 16 pixels per loop.
"subs %2, %2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@ -839,7 +843,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
"+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
@ -849,6 +853,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
"add %0, %0, %3, lsl #1 \n"
@ -868,7 +873,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
"+r"(width64) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1"
);
@ -877,6 +882,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#ifdef HAS_ARGBMIRRORROW_NEON
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
"add %0, %0, %2, lsl #2 \n"
@ -894,7 +900,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
"+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
@ -908,7 +914,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"1: \n"
MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n"
@ -928,7 +934,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1)
@ -963,7 +969,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@ -1022,7 +1028,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@ -1055,7 +1061,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@ -1075,7 +1081,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n"
@ -1094,7 +1100,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
"orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1)
@ -1115,7 +1121,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
@ -1134,7 +1140,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n"
@ -1154,7 +1160,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
@ -1177,7 +1183,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
@ -1201,7 +1207,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
@ -1231,7 +1237,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
@ -1253,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
}
#endif // HAS_UYVYTOUVROW_NEON
// Select G channels from ARGB. e.g. GGGGGGGG
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
#ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
@ -1284,7 +1269,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4.
@ -1312,7 +1297,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
@ -1341,7 +1326,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
@ -1362,7 +1347,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
@ -1376,6 +1361,31 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
}
#endif // HAS_ARGBTORGB565ROW_NEON
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
MEMACCESS(1)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n"
ARGBTORGB565
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTORGB565ROW_NEON
#ifdef HAS_ARGBTOARGB1555ROW_NEON
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) {
@ -1383,7 +1393,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
@ -1405,7 +1415,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"1: \n"
MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
@ -1429,7 +1439,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
@ -1456,7 +1466,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R
@ -1487,7 +1497,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
@ -1531,7 +1541,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop.
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R
@ -1587,7 +1597,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop.
"subs %w3, %w3, #32 \n" // 32 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R
@ -1653,7 +1663,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1700,7 +1710,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1741,7 +1751,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1782,7 +1792,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1823,7 +1833,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1864,7 +1874,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1905,7 +1915,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@ -1971,7 +1981,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"urshr v5.8h, v18.8h, #1 \n"
"urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v16.8h, v4.8h, v22.8h \n" // B
"mls v16.8h, v5.8h, v23.8h \n" // G
"mls v16.8h, v6.8h, v24.8h \n" // R
@ -2042,7 +2052,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R
@ -2113,7 +2123,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R
@ -2153,7 +2163,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G
@ -2183,7 +2193,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@ -2212,7 +2222,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G
@ -2241,7 +2251,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B
@ -2269,7 +2279,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B
@ -2297,7 +2307,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R
@ -2325,7 +2335,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
@ -2353,7 +2363,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"1: \n"
MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
@ -2380,13 +2390,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile (
"cmp %4, #0 \n"
"cmp %w4, #0 \n"
"b.eq 100f \n"
"cmp %4, #64 \n"
"cmp %w4, #64 \n"
"b.eq 75f \n"
"cmp %4, #128 \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
"cmp %4, #192 \n"
"cmp %w4, #192 \n"
"b.eq 25f \n"
"dup v5.16b, %w4 \n"
@ -2397,7 +2407,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"umull v2.8h, v0.8b, v4.8b \n"
"umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n"
@ -2415,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
@ -2429,7 +2439,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
@ -2442,7 +2452,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
@ -2454,7 +2464,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
@ -2477,7 +2487,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"subs %3, %3, #8 \n"
"subs %w3, %w3, #8 \n"
"b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
@ -2485,7 +2495,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
@ -2504,7 +2514,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"b.ge 8b \n"
"89: \n"
"adds %3, %3, #8-1 \n"
"adds %w3, %w3, #8-1 \n"
"b.lt 99f \n"
// Blend 1 pixels.
@ -2513,7 +2523,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"subs %w3, %w3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
@ -2552,7 +2562,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a
@ -2586,7 +2596,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n"
@ -2630,7 +2640,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"1: \n"
MEMACCESS(0)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n"
@ -2667,7 +2677,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R
@ -2706,7 +2716,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
@ -2746,7 +2756,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n"
MEMACCESS(0)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r
@ -2808,7 +2818,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
"umull v2.8h, v2.8b, v6.8b \n" // multiply R
@ -2842,7 +2852,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
@ -2872,7 +2882,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
@ -2907,7 +2917,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
@ -2935,7 +2945,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
@ -2966,7 +2976,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
@ -3006,7 +3016,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2)
"ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"subs %w4, %w4, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n"
@ -3019,8 +3029,8 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y2), // %2
"+r"(dst_sobelx), // %3
"+r"(width) // %4
: "r"(2), // %5
"r"(6) // %6
: "r"(2LL), // %5
"r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
@ -3051,7 +3061,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1)
"ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n"
@ -3063,8 +3073,8 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
"+r"(width) // %3
: "r"(1), // %4
"r"(6) // %5
: "r"(1LL), // %4
"r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}

File diff suppressed because it is too large Load Diff

View File

@ -23,9 +23,6 @@ namespace libyuv {
extern "C" {
#endif
// Remove this macro if OVERREAD is safe.
#define AVOID_OVERREAD 1
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
@ -44,9 +41,8 @@ static void ScalePlaneDown2(int src_width, int src_height,
int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
filtering == kFilterNone ? ScaleRowDown2_C :
(filtering == kFilterLinear ? ScaleRowDown2Linear_C :
ScaleRowDown2Box_C);
filtering == kFilterNone ? ScaleRowDown2_C :
(filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
@ -54,15 +50,39 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
ScaleRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
ScaleRowDown2Box_NEON);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
ScaleRowDown2Box_Any_AVX2);
if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
ScaleRowDown2Box_AVX2);
}
}
#endif
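
The dispatch pattern introduced here, and repeated for the row functions below: select the _Any_ variant first, which handles arbitrary widths by finishing the tail in C, then upgrade to the fully-vectorized kernel when the width is a multiple of the vector size. A minimal sketch of the idiom with hypothetical names:

typedef void (*RowFn)(const unsigned char* src, unsigned char* dst, int w);

static RowFn ChooseRow(int have_simd, int width, int align,
                       RowFn c_row, RowFn any_row, RowFn fast_row) {
  RowFn row = c_row;
  if (have_simd) {
    row = any_row;                  // safe for any width
    if ((width & (align - 1)) == 0)
      row = fast_row;               // width is a multiple of the vector size
  }
  return row;
}

AVX2 kernels process 32 pixels where SSE2 processes 16, which is why the two IS_ALIGNED checks above differ.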
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
@ -154,13 +174,30 @@ static void ScalePlaneDown4(int src_width, int src_height,
src_stride = 0;
}
#if defined(HAS_SCALEROWDOWN4_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
@ -249,24 +286,42 @@ static void ScalePlaneDown34(int src_width, int src_height,
ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
}
#if defined(HAS_SCALEROWDOWN34_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
}
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
}
}
}
#endif
@ -422,23 +477,41 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
}
#if defined(HAS_SCALEROWDOWN38_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
}
if (dst_width % 12 == 0) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
}
}
#endif
#if defined(HAS_SCALEROWDOWN38_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
}
if (dst_width % 12 == 0 && !filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
} else {
}
if (dst_width % 6 == 0 && filtering) {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
}
@ -559,65 +632,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
}
}
static __inline uint32 SumBox(int iboxwidth, int iboxheight,
ptrdiff_t src_stride, const uint8* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
ptrdiff_t src_stride, const uint16* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint8* src_ptr, uint8* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint16* src_ptr, uint16* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
#define MIN1(x) ((x) < 1 ? 1 : (x))
static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
uint32 sum = 0u;
@ -643,15 +658,15 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = (dx >> 16);
int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
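
A small worked example of the 16.16 fixed-point walk above, with assumed sizes (downscaling a 10-pixel row to 3; the library derives dx via ScaleSlope, so the real step may differ slightly):

#include <stdio.h>

int main(void) {
  int dx = (10 << 16) / 3;  // 16.16 step, about 3.33 source pixels per output
  int x = 0;
  int i;
  for (i = 0; i < 3; ++i) {
    int ix = x >> 16;
    int boxwidth;
    x += dx;
    boxwidth = (x >> 16) - ix;  // prints 3, 3, 3; truncation leaves column 9 unused
    printf("output %d: source start %d, boxwidth %d\n", i, ix, boxwidth);
  }
  return 0;
}

MIN1 guards the boxwidth-0 case that can occur when dx < 0x10000, i.e. when the plane is upscaled horizontally.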
@ -660,25 +675,36 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = (dx >> 16);
int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth;
int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
scaleptr[boxwidth] >> 16;
boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ =
SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
const uint16* src_ptr, uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
}
}
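
This new ScaleAddCols0_C fast path applies when dx is exactly 1.0 in 16.16 fixed point (0x10000): each output column is a single accumulated source column, so the division by the box area reduces to one multiply. A quick check with assumed values:

static unsigned char AverageColumn(unsigned short acc, int boxheight) {
  int scaleval = 65536 / boxheight;              // 16384 when boxheight is 4
  return (unsigned char)(acc * scaleval >> 16);  // acc 400 -> 100, i.e. 400 / 4
}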
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16);
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth;
@ -687,7 +713,7 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) {
int boxwidth = (dx >> 16);
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
for (i = 0; i < dst_width; ++i) {
@ -707,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
int j;
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@ -717,10 +743,40 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
uint8* dst = dst_ptr;
int j;
{
// Allocate a row buffer of uint16.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C:
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleAddRow = ScaleAddRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_SSE2;
}
}
#endif
#if defined(HAS_SCALEADDROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleAddRow = ScaleAddRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
ScaleAddRow = ScaleAddRow_AVX2;
}
}
#endif
#if defined(HAS_SCALEADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleAddRow = ScaleAddRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
@ -729,46 +785,13 @@ static void ScalePlaneBox(int src_width, int src_height,
if (y > max_y) {
y = max_y;
}
boxheight = (y >> 16) - iy;
ScalePlaneBoxRow_C(dst_width, boxheight,
x, dx, src_stride,
src, dst);
dst += dst_stride;
}
return;
}
{
// Allocate a row buffer of uint16.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint8* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
boxheight = MIN1((y >> 16) - iy);
memset(row16, 0, src_width * 2);
for (k = 0; k < boxheight; ++k) {
ScaleAddRow(src, (uint16 *)(row16), src_width);
src += src_stride;
}
boxheight = (y >> 16) - iy;
ScaleAddRows(src, src_stride, (uint16*)(row16),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
dst_ptr);
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
dst_ptr += dst_stride;
}
free_aligned_buffer_64(row16);
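
Restructured, one output row of the box filter now reads: zero a uint16 accumulator row, ScaleAddRow each of the boxheight source rows into it, then ScaleAddCols divides each horizontal box by its area. A scalar sketch of that shape (hypothetical helper mirroring the loop above, not a replacement for the dispatched kernels):

static void BoxOneOutputRow(const unsigned char* src, int src_stride,
                            unsigned short* row16, unsigned char* dst,
                            int src_width, int dst_width, int boxheight,
                            int x, int dx) {
  int i, j, k;
  for (i = 0; i < src_width; ++i) row16[i] = 0;
  for (k = 0; k < boxheight; ++k) {       // vertical accumulate (ScaleAddRow)
    for (i = 0; i < src_width; ++i) row16[i] += src[i];
    src += src_stride;
  }
  for (i = 0; i < dst_width; ++i) {       // horizontal average (ScaleAddCols)
    int ix = x >> 16;
    int w, sum = 0;
    x += dx;
    w = (x >> 16) - ix;
    if (w < 1) w = 1;                     // MIN1
    for (j = 0; j < w; ++j) sum += row16[ix + j];
    dst[i] = (unsigned char)(sum / (w * boxheight));
  }
}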
@ -779,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint16* src_ptr, uint16* dst_ptr) {
int j;
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@ -789,10 +812,21 @@ static void ScalePlaneBox_16(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
uint16* dst = dst_ptr;
int j;
{
// Allocate a row buffer of uint32.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
ScaleAddRow_16_C;
#if defined(HAS_SCALEADDROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
@ -801,46 +835,13 @@ static void ScalePlaneBox_16(int src_width, int src_height,
if (y > max_y) {
y = max_y;
}
boxheight = (y >> 16) - iy;
ScalePlaneBoxRow_16_C(dst_width, boxheight,
x, dx, src_stride,
src, dst);
dst += dst_stride;
}
return;
}
{
// Allocate a row buffer of uint32.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint16* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
boxheight = MIN1((y >> 16) - iy);
memset(row32, 0, src_width * 4);
for (k = 0; k < boxheight; ++k) {
ScaleAddRow(src, (uint32 *)(row32), src_width);
src += src_stride;
}
boxheight = (y >> 16) - iy;
ScaleAddRows(src, src_stride, (uint32*)(row32),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
dst_ptr);
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
dst_ptr += dst_stride;
}
free_aligned_buffer_64(row32);
@ -920,6 +921,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif
if (y > max_y) {
y = max_y;
@ -1057,8 +1066,8 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
@ -1111,6 +1120,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
@ -1129,7 +1146,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
const uint8* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15;
const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row;
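The row buffers here now round up to 32 bytes instead of 16, presumably to give headroom for the new AVX2 row functions that store 32 bytes per iteration. The mask arithmetic, checked on an example:
// (w + 31) & ~31 rounds w up to the next multiple of 32:
// w = 100 -> (100 + 31) & ~31 = 131 & ~31 = 128.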
@ -1188,8 +1205,8 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C;
void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
@ -1260,7 +1277,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
const uint16* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15;
const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 4);
uint16* rowptr = (uint16*)row;
@ -1334,8 +1351,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
}
for (i = 0; i < dst_height; ++i) {
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
dst_width, x, dx);
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
}
@ -1385,8 +1401,7 @@ void ScalePlane(const uint8* src, int src_stride,
enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height,
filtering);
dst_width, dst_height, filtering);
// Negative height means invert the image.
if (src_height < 0) {
@ -1402,9 +1417,9 @@ void ScalePlane(const uint8* src, int src_stride,
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
return;
}
if (dst_width == src_width) {
if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled vertically.
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height,
dst_width, dst_height,
src_stride, dst_stride, src, dst,
@ -1435,7 +1450,7 @@ void ScalePlane(const uint8* src, int src_stride,
return;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) {
(filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@ -1469,8 +1484,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height,
filtering);
dst_width, dst_height, filtering);
// Negative height means invert the image.
if (src_height < 0) {
@ -1563,6 +1577,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1;
}
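The new 32768 cap is presumably tied to the signed 32-bit 16.16 fixed-point coordinate math (note the matching src_width < 32768 guards on the SSSE3 column scalers elsewhere in this change); this is an assumed rationale, not stated in the commit:
// Assumed rationale: x/y/dx/dy are int32 16.16 fixed point, and
// 32769 << 16 == 2147549184 > INT32_MAX (2147483647), so dimensions
// beyond 32768 would overflow the coordinate stepping.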
@ -1594,6 +1609,7 @@ int I420Scale_16(const uint16* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1;
}

third_party/libyuv/source/scale_any.cc vendored Normal file (200 additions)
View File

@ -0,0 +1,200 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
int dst_width, int x, int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, \
dst_width & MASK, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C, 4, 3)
#endif
#undef CANY
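For reference, this is what the CANY wrapper expands to for ScaleFilterCols_Any_NEON (BPP 1, MASK 7): the NEON kernel covers the multiple-of-8 span and the C fallback finishes the 0..7 pixel tail, continuing from x + n * dx so the fixed-point position stays consistent.
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int x, int dx) {
  int n = dst_width & ~7;                      // multiple-of-8 span for NEON
  if (n > 0) {
    ScaleFilterCols_NEON(dst_ptr, src_ptr, n, x, dx);
  }
  ScaleFilterCols_C(dst_ptr + n * 1, src_ptr,  // 0..7 pixel tail in C
                    dst_width & 7, x + n * dx, dx);
}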
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSE2
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
4, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C, 2, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C, 2, 4, 7)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
src_stepx, dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -53,16 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
}
#endif
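The SIMD selection above follows the two-tier pattern used throughout this change: when the CPU flag is set, take the _Any_ variant first (safe for any width, with a C tail inside), then upgrade to the full-vector kernel if dst_width is a multiple of the vector width. The shape of the pattern, sketched with names from this diff:
typedef void (*ScaleRowDown2Fn)(const uint8* src, ptrdiff_t stride,
                                uint8* dst, int dst_width);
ScaleRowDown2Fn fn = ScaleARGBRowDown2Box_C;
if (TestCpuFlag(kCpuHasNEON)) {
  fn = ScaleARGBRowDown2Box_Any_NEON;   // any width; C tail inside
  if (IS_ALIGNED(dst_width, 8)) {
    fn = ScaleARGBRowDown2Box_NEON;     // full 8-pixel vectors only
  }
}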
@ -86,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
int x, int dx, int y, int dy) {
int j;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@ -96,15 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@ -135,15 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
}
#endif
@ -229,6 +255,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
@ -321,10 +355,26 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@ -344,7 +394,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15;
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row;
@ -495,10 +545,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@ -521,7 +587,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15;
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion.
@ -606,6 +672,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2;
}
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
@ -744,6 +818,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
@ -762,6 +837,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height,
enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}

View File

@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
}
}
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint8* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) {
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
assert(src_height > 0);
for (x = 0; x < src_width; ++x) {
const uint16* s = src_ptr + x;
unsigned int sum = 0u;
int y;
for (y = 0; y < src_height; ++y) {
sum += s[0];
s += src_stride;
}
// No risk of overflow here now
dst_ptr[x] = sum;
for (x = 0; x < src_width - 1; x += 2) {
dst_ptr[0] += src_ptr[0];
dst_ptr[1] += src_ptr[1];
src_ptr += 2;
dst_ptr += 2;
}
if (src_width & 1) {
dst_ptr[0] += src_ptr[0];
}
}
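Note the interface change: the removed ScaleAddRows_C summed src_height rows internally and wrote the totals, while the new ScaleAddRow_C accumulates a single row into dst_ptr, so clearing the accumulator is now the caller's job (as in the box-filter loops above). A caller-side sketch under that reading:
static void BoxSumRows(const uint8* src, ptrdiff_t src_stride,
                       uint16* row16, int src_width, int boxheight) {
  int k;
  memset(row16, 0, src_width * 2);         // clear uint16 accumulators
  for (k = 0; k < boxheight; ++k) {
    ScaleAddRow_C(src, row16, src_width);  // += one source row
    src += src_stride;
  }
}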
@ -1030,10 +1022,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear;
}
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
}
if (filtering == kFilterBilinear) {
if (src_height == 1) {

View File

@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
);
}
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0;
intptr_t tmp_src = 0;
asm volatile (
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %5,%2 \n"
"test %2,%2 \n"
"je 3f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqu " MEMACCESS(3) ",%%xmm2 \n"
"add %6,%3 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n"
"jg 2b \n"
"jg 1b \n"
LABELALIGN
"3: \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
int src_stepx, uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0;
asm volatile (

View File

@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
@ -517,6 +541,112 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q3, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"vmov q10, q1 \n"
"vmov q11, q2 \n"
"vuzp.16 q10, q11 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vsubl.s16 q11, d18, d16 \n"
"vsubl.s16 q12, d19, d17 \n"
"vmovl.u16 q13, d20 \n"
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vshrn.s32 d18, q11, #16 \n"
"vshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13"
);
}
#undef LOAD2_DATA8_LANE
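As a scalar model of what the NEON kernel above computes per pixel (the vsubl/vmul/vshrn #16 sequence is linear interpolation weighted by the low 16 bits of x) -- assumed equivalent, not the shipped C fallback:
static void ScaleFilterColsScalar(uint8* dst_ptr, const uint8* src_ptr,
                                  int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int a = src_ptr[x >> 16];            // left neighbor
    int b = src_ptr[(x >> 16) + 1];      // right neighbor
    dst_ptr[j] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
    x += dx;
  }
}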
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
@ -640,6 +770,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
@ -757,6 +916,119 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp = 0;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
}
#undef LOAD1_DATA32_LANE
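ScaleARGBCols_NEON is plain point sampling of 32-bit pixels: each LOAD1_DATA32_LANE reads the ARGB word at x >> 16 and steps x by dx. A scalar sketch (assumed equivalent):
static void ScaleARGBColsScalar(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];               // copy one ARGB pixel
    x += dx;
  }
}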
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q9, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
"vmov.i8 q3, #0x7f \n" // 0x7F
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
LOAD2_DATA32_LANE(d0, d2, 1)
LOAD2_DATA32_LANE(d1, d3, 0)
LOAD2_DATA32_LANE(d1, d3, 1)
"vshrn.i32 d22, q8, #9 \n"
"vand.16 d22, d22, d30 \n"
"vdup.8 d24, d22[0] \n"
"vdup.8 d25, d22[2] \n"
"vdup.8 d26, d22[4] \n"
"vdup.8 d27, d22[6] \n"
"vext.8 d4, d24, d25, #4 \n"
"vext.8 d5, d26, d27, #4 \n" // f
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
"vmull.u8 q11, d0, d20 \n"
"vmull.u8 q12, d1, d21 \n"
"vmull.u8 q13, d2, d4 \n"
"vmull.u8 q14, d3, d5 \n"
"vadd.i16 q11, q11, q13 \n"
"vadd.i16 q12, q12, q14 \n"
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#undef LOAD2_DATA32_LANE
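The filtering variant blends with a 7-bit fraction: f is (x >> 9) & 0x7f, and each channel mixes as (a * (0x7f ^ f) + b * f) >> 7, matching the shrn #9 / and 0x7f / umull / shrn #7 sequence above. A scalar model, offered as an assumption-level sketch rather than the shipped C path:
static void ScaleARGBFilterColsScalar(uint8* dst_argb, const uint8* src_argb,
                                      int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    int f = (x >> 9) & 0x7f;             // 7-bit fraction of x
    const uint8* a = src_argb + (x >> 16) * 4;
    const uint8* b = a + 4;
    for (c = 0; c < 4; ++c) {            // B, G, R, A
      dst_argb[j * 4 + c] = (uint8)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}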
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus

View File

@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n"
// load even pixels into v0, odd into v1
MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
"subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n"
@ -103,7 +126,7 @@ asm volatile (
"ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n"
"subs %5, %5, #4 \n"
"subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n"
@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n"
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
"subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n"
"subs %w2, %w2, #24 \n"
// average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n"
"urhadd v1.8b, v1.8b, v5.8b \n"
@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #12 \n"
"subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n"
@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %4, %4, #12 \n"
"subs %w4, %w4, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %3, %3, #12 \n"
"subs %w3, %w3, #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
);
}
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
"1: \n"
"mov %0, %1 \n"
"mov w12, %w5 \n"
"eor v2.16b, v2.16b, v2.16b \n"
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v3.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"add v2.4s, v1.4s, v3.4s \n"
"shl v0.4s, v3.4s, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"mov v6.16b, v1.16b \n"
"mov v7.16b, v2.16b \n"
"uzp1 v6.8h, v6.8h, v7.8h \n"
"ushll v4.8h, v4.8b, #0 \n"
"ushll v5.8h, v5.8b, #0 \n"
"ssubl v16.4s, v5.4h, v4.4h \n"
"ssubl2 v17.4s, v5.8h, v4.8h \n"
"ushll v7.4s, v6.4h, #0 \n"
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
"shrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3",
"v4", "v5", "v6", "v7", "v16", "v17"
);
}
#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
"cmp %w4, #0 \n"
"b.eq 100f \n"
"add %2, %2, %1 \n"
"cmp %4, #64 \n"
"cmp %w4, #64 \n"
"b.eq 75f \n"
"cmp %4, #128 \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
"cmp %4, #192 \n"
"cmp %w4, #192 \n"
"b.eq 25f \n"
"dup v5.8b, %w4 \n"
@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n"
"umull2 v7.8h, v0.16b, v4.16b \n"
"umlal v6.8h, v1.8b, v5.8b \n"
@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"100: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
"subs %w3, %w3, #16 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n"
@ -631,7 +761,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1)
@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS (0)
// load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n"
MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
: "r"((int64)(src_stepx * 4)) // %3
: "memory", "cc", "v0"
);
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO, might be worth another optimization pass in future.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
@ -717,36 +874,36 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n"
"ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n"
"ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n"
"ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n"
"ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n"
"ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n"
"ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n"
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
"ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n"
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n"
@ -754,10 +911,129 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
"+r"(dst_width) // %3
: "r"(src_stepx * 4) // %4
: "r"((int64)(src_stepx * 4)) // %4
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {"#vn".s}["#n"], [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 tmp64 = 0;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(v0, 0)
LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3)
LOAD1_DATA32_LANE(v1, 0)
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp64), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1"
);
}
#undef LOAD1_DATA32_LANE
// TODO(Yang Zhang): Investigate using fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v6.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
"movi v3.16b, #0x7f \n" // 0x7F
"movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v5.4s, v1.4s, v0.4s \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
LOAD2_DATA32_LANE(v0, v1, 1)
LOAD2_DATA32_LANE(v0, v1, 2)
LOAD2_DATA32_LANE(v0, v1, 3)
"shrn v2.4h, v5.4s, #9 \n"
"and v2.8b, v2.8b, v4.8b \n"
"dup v16.8b, v2.b[0] \n"
"dup v17.8b, v2.b[2] \n"
"dup v18.8b, v2.b[4] \n"
"dup v19.8b, v2.b[6] \n"
"ext v2.8b, v16.8b, v17.8b, #4 \n"
"ext v17.8b, v18.8b, v19.8b, #4 \n"
"ins v2.d[1], v17.d[0] \n" // f
"eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
"umull v16.8h, v0.8b, v7.8b \n"
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v16", "v17", "v18", "v19"
);
}
#undef LOAD2_DATA32_LANE
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus

View File

@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
@ -16,7 +17,8 @@ extern "C" {
#endif
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
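vpmaddubsw against the all-ones ymm4 sums adjacent byte pairs, and vpavgw against zero turns each sum into a rounded half, so the scalar model of this kernel is simply:
static void ScaleRowDown2LinearScalar(const uint8* src_ptr, uint8* dst_ptr,
                                      int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[2 * i] + src_ptr[2 * i + 1] + 1) >> 1);
  }
}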
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN2_AVX2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm7, 8
wloop:
movdqu xmm0, [eax]
movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16]
@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
vpsrld ymm5, ymm5, 24
vpslld ymm5, ymm5, 16
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
vzeroupper
ret
}
}
// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_ptr
mov esi, [esp + 8 + 8] // src_stride
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
vpsrlw ymm7, ymm7, 8
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpavgb ymm2, ymm2, [eax + edi]
vpavgb ymm3, ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpavgb ymm0, ymm0, ymm2
vpavgb ymm1, ymm1, ymm3
vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
vpand ymm3, ymm1, ymm7
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN4_AVX2
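The Box kernels approximate the 4x4 mean with nested rounding averages (pavgb/pavgw) rather than an exact sum / 16; a sketch of the nesting as read from the vector code:
static uint8 Avg2(uint8 a, uint8 b) { return (uint8)((a + b + 1) >> 1); }
// Vertical, per column x:
//   v[x] = Avg2(Avg2(row0[x], row1[x]), Avg2(row2[x], row3[x]))
// Horizontal, per output pixel i:
//   dst[i] = Avg2(Avg2(v[4*i], v[4*i+1]), Avg2(v[4*i+2], v[4*i+3]))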
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
}
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler
// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
}
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
}
}
// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
push esi
push edi
push ebx
push ebp
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
dec ebx
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
xloop:
// first row
movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
punpckhbw xmm1, xmm4
lea esi, [esi + 16]
mov ebp, ebx
test ebp, ebp
je ydone
// sum remaining rows
yloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
movdqu xmm0, [edx] // read 16 words from destination
movdqu xmm1, [edx + 16]
movdqa xmm2, xmm3
punpcklbw xmm2, xmm5
punpckhbw xmm3, xmm5
paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3
sub ebp, 1
jg yloop
ydone:
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]
movdqu [edx], xmm0 // write 16 words to destination
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 16
jg xloop
pop ebp
pop ebx
pop edi
pop esi
ret
}
}
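The rewritten accumulator is simpler than the old ScaleAddRows_SSE2: one source row per call, summed into 16-bit totals that the caller owns. A scalar equivalent (the SSE2 code saturates with paddusw; plain addition here for clarity):

#include <stdint.h>

// Widen each source byte and accumulate into the 16-bit row sums.
static void ScaleAddRowSketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}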
// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
// xor ebx, ebx
// mov bx, word ptr [esi + eax] // 2 source x0 pixels
// To
// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
__asm {
mov eax, [esp + 4] // src_ptr
mov edx, [esp + 8] // dst_ptr
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm3, [edx + 32]
vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
jg xloop
vzeroupper
ret
}
}
#endif // HAS_SCALEADDROW_AVX2
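Moving the row loop out of the kernel leaves height handling to the caller. A hypothetical driver loop showing the intended accumulate-then-average pattern (not libyuv's actual scaler logic; ScaleAddRowSketch is the scalar stand-in above):

#include <stdint.h>
#include <string.h>

// Box-filter 'boxheight' source rows into one destination row.
static void BoxFilterRowsSketch(const uint8_t* src, int src_stride,
                                uint16_t* sums, uint8_t* dst,
                                int width, int boxheight) {
  int y, x;
  memset(sums, 0, width * sizeof(uint16_t));  // fresh sums per output row
  for (y = 0; y < boxheight; ++y) {
    ScaleAddRowSketch(src + y * src_stride, sums, width);
  }
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)(sums[x] / boxheight);  // average of the box
  }
}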
// Bilinear column filtering. SSSE3 version.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
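The column filter walks a 16.16 fixed-point position x by step dx and blends the two neighboring source pixels. A scalar sketch with a full 16-bit fraction (the SSSE3 path quantizes the fraction to 7 bits for pmaddubsw, so it can differ by a small rounding amount):

#include <stdint.h>

// Bilinear horizontal scale: dst[j] = lerp(src[x>>16], src[(x>>16)+1]).
// Edge handling is omitted here; the real code special-cases the last
// pixel so src_ptr[xi + 1] is never read out of bounds.
static void ScaleFilterColsSketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;    // integer source index
    int f = x & 0xffff;  // fractional position
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (65536 - f) + b * f) >> 16);
    x += dx;
  }
}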
// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
__asm {
@@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
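2x horizontal upscale needs no fixed-point math at all, which is why x and dx go unused; each source pixel is simply written twice. A scalar sketch:

#include <stdint.h>

// Duplicate every source pixel (16 -> 32 pixels in the SSE2 version).
static void ScaleColsUp2Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                               int dst_width, int x, int dx) {
  int j;
  (void)x;
  (void)dx;
  for (j = 0; j < dst_width / 2; ++j) {
    dst_ptr[2 * j + 0] = src_ptr[j];
    dst_ptr[2 * j + 1] = src_ptr[j];
  }
}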
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
}
// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
}
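The "linear" 2x ARGB downscale blends only horizontally: each output pixel is the rounded per-channel average of two adjacent source pixels. A scalar sketch (hypothetical name):

#include <stdint.h>

// 8x1 -> 4x1: average horizontally adjacent ARGB pixels per channel.
static void ScaleARGBRowDown2LinearSketch(const uint8_t* src_argb,
                                          uint8_t* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      uint8_t left = src_argb[x * 8 + c];
      uint8_t right = src_argb[x * 8 + 4 + c];
      dst_argb[x * 4 + c] = (uint8_t)((left + right + 1) >> 1);  // pavgb
    }
  }
}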
// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
@@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSSE2(const uint8* src_argb,
}
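The box variant adds the second row: vertical rounded average first, then the horizontal one, matching the pavgb order in the SSE2 code. A scalar sketch:

#include <stdint.h>
#include <stddef.h>

static uint8_t RoundAvgArgb(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // pavgb rounding
}

// 8x2 -> 4x1: 2x2 box blend per ARGB channel.
static void ScaleARGBRowDown2BoxSketch(const uint8_t* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8_t* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {
      const uint8_t* top = src_argb + x * 8 + c;
      const uint8_t* bot = top + src_stride;
      dst_argb[x * 4 + c] =
          RoundAvgArgb(RoundAvgArgb(top[0], bot[0]),   // vertical pair, left
                       RoundAvgArgb(top[4], bot[4]));  // vertical pair, right
    }
  }
}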
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
@@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
}
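Even downsampling is pure point sampling with a caller-supplied pixel step. Treating each ARGB pixel as a 32-bit word makes the scalar version a one-liner (hypothetical name; the stride handling used by the box variant is omitted):

#include <stdint.h>

// Copy every src_stepx-th ARGB pixel; one uint32_t moves all 4 channels.
static void ScaleARGBRowDownEvenSketch(const uint8_t* src_argb, int src_stepx,
                                       uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}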
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride,
int src_stepx,
@@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
}
// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
__asm {
@@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
}
// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
__declspec(naked)
int FixedDiv_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
@@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
}
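FixedDiv returns num/div in 16.16 fixed point, e.g. FixedDiv(640, 480) = 640 * 65536 / 480 = 87381, about 1.3333. The shld/idiv pair is a 64-by-32 divide; a portable C sketch of the same computation:

#include <stdint.h>

// 16.16 fixed-point divide: (num << 16) / div with a 64-bit dividend.
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}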
// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
@@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
ret
}
}
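FixedDiv1 is the inclusive-endpoint variant: as I read the shipped implementation, it computes ((num - 1) * 65536 - 1) / (div - 1), a step that lands the last of div samples exactly on source pixel num - 1, nudged down by 1/65536 so accumulation never steps past it. A sketch under that reading (assumes div > 1):

#include <stdint.h>

// Step for div samples spanning source indices [0, num - 1] in 16.16.
static int FixedDiv1Sketch(int num, int div) {
  return (int)((((int64_t)(num - 1) << 16) - 1) / (div - 1));
}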
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"