update libyuv to r1456

picks up build warning fixes for visual studio 2015

Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520
James Zern
2015-07-24 16:54:51 -07:00
parent f42012e526
commit fcb4253c9c
46 changed files with 5400 additions and 2955 deletions


@@ -22,17 +22,18 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
 third_party/libyuv/source/planar_functions.cc \
 third_party/libyuv/source/row_any.cc \
 third_party/libyuv/source/row_common.cc \
+third_party/libyuv/source/row_gcc.cc \
 third_party/libyuv/source/row_mips.cc \
 third_party/libyuv/source/row_neon.cc \
 third_party/libyuv/source/row_neon64.cc \
-third_party/libyuv/source/row_posix.cc \
 third_party/libyuv/source/row_win.cc \
 third_party/libyuv/source/scale.cc \
+third_party/libyuv/source/scale_any.cc \
 third_party/libyuv/source/scale_common.cc \
+third_party/libyuv/source/scale_gcc.cc \
 third_party/libyuv/source/scale_mips.cc \
 third_party/libyuv/source/scale_neon.cc \
 third_party/libyuv/source/scale_neon64.cc \
-third_party/libyuv/source/scale_posix.cc \
 third_party/libyuv/source/scale_win.cc \
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1305
+Version: 1456
 License: BSD
 License File: LICENSE
@@ -13,4 +13,3 @@ which down-samples the original input video (f.g. 1280x720) a number of times
 in order to encode multiple resolution bit streams.
 Local Modifications:
-cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'


@@ -71,6 +71,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
+#define J400ToJ420 I400ToI420
 // Convert NV12 to I420.
 LIBYUV_API
 int NV12ToI420(const uint8* src_y, int src_stride_y,


@@ -68,20 +68,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
-// Convert I400 (grey) to ARGB.
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
 LIBYUV_API
 int I400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
-// Alias.
-#define YToARGB I400ToARGB_Reference
-// Convert I400 to ARGB. Reverse of ARGBToI400.
+// Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+int J400ToARGB(const uint8* src_y, int src_stride_y,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height);
+// Alias.
+#define YToARGB I400ToARGB
 // Convert NV12 to ARGB.
 LIBYUV_API
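
For reference, a minimal usage sketch of the renamed grey-to-ARGB entry point (buffer names and dimensions below are hypothetical, not part of this change):

  #include "libyuv.h"  // umbrella header; J400ToARGB is declared in convert_argb.h
  // Expand an 8-bit grey (J400) plane to ARGB. J400ToARGB replaces the old
  // I400ToARGB_Reference name; YToARGB now aliases I400ToARGB instead.
  uint8 gray[64 * 48];          // 64x48 grey plane (hypothetical size)
  uint8 argb[64 * 48 * 4];      // 4 bytes per ARGB pixel
  libyuv::J400ToARGB(gray, 64, argb, 64 * 4, 64, 48);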


@@ -137,6 +137,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
                  uint8* dst_frame, int dst_stride_frame,
                  int width, int height);
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_frame, int dst_stride_frame,
+                       const uint8* dither4x4, int width, int height);
 LIBYUV_API
 int I420ToARGB1555(const uint8* src_y, int src_stride_y,
                    const uint8* src_u, int src_stride_u,
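
A minimal sketch of calling the new dithered conversion declared above; per the comment, the 16 dither values should be in the 0..7 range and the first byte applies to the upper-left pixel (plane pointers, strides and sizes here are hypothetical):

  #include "libyuv.h"
  static const uint8 kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
  };
  // src_y/src_u/src_v describe an I420 frame; dst_rgb565 holds width * 2 bytes per row.
  libyuv::I420ToRGB565Dither(src_y, y_stride, src_u, uv_stride, src_v, uv_stride,
                             dst_rgb565, width * 2, kDither4x4, width, height);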


@@ -61,12 +61,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
                  uint8* dst_rgb565, int dst_stride_rgb565,
                  int width, int height);
-// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
-// Values in dither matrix from 0 to 255. 128 is best for no dither.
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in dither matrix from 0 to 7 recommended.
+// The order of the dither matrix is first byte is upper left.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
 LIBYUV_API
 int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither8x8, int width, int height);
+                       const uint8* dither4x4, int width, int height);
 // Convert ARGB To ARGB1555.
 LIBYUV_API
@@ -140,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+            uint8* dst_g, int dst_stride_g,
+            int width, int height);
 // Convert ARGB To NV12.
 LIBYUV_API
 int ARGBToNV12(const uint8* src_argb, int src_stride_argb,


@@ -45,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
+#define J400ToJ400 I400ToI400
 // Copy I422 to I422.
 #define I422ToI422 I422Copy
@@ -84,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_v, int dst_stride_v,
                int width, int height);
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height);
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,
@@ -93,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
                int width, int height);
 // Alias
+#define J420ToJ400 I420ToI400
 #define I420ToI420Mirror I420Mirror
 // I420 mirror.
@@ -387,24 +401,24 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation);
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
-// Row functions for copying a pixels from a source with a slope to a row
+// Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 LIBYUV_API
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif  // LIBYUV_DISABLE_X86
 // Shuffle ARGB channel order. e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
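
As a usage sketch for the YUY2ToNV12/UYVYToNV12 helpers added above (buffer names are hypothetical; YUY2 packs 2 bytes per pixel, NV12 stores a full-size Y plane plus a half-height interleaved UV plane):

  #include "libyuv.h"
  libyuv::YUY2ToNV12(src_yuy2, width * 2,   // packed 4:2:2 input, 2 bytes per pixel
                     dst_y, width,          // Y plane, width bytes per row
                     dst_uv, width,         // UV plane, width bytes per row, height/2 rows
                     width, height);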


@@ -0,0 +1,138 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_ROTATE_ROW_H_
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if defined(__pnacl__) || defined(__CLR_VER) || \
(defined(__i386__) && !defined(__SSE2__))
#define LIBYUV_DISABLE_X86
#endif
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && \
defined(_MSC_VER) && _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
// The following are available for Visual C:
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
// The following are available for 32 bit GCC:
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
// The following are available for 64 bit GCC but not NaCL:
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
#define HAS_TRANSPOSEWX8_FAST_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSEWX8_MIPS_DSPR2
#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
#endif // defined(__mips__)
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width, int height);
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height);
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT


@@ -37,10 +37,8 @@ extern "C" {
     free(var##_mem);  \
     var = 0
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR) || \
-    (defined(__i386__) && !defined(__SSE2__)) || \
-    (defined(_MSC_VER) && defined(__clang__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -48,6 +46,9 @@ extern "C" {
 #define LIBYUV_SSSE3_ONLY
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -63,11 +64,11 @@ extern "C" {
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
 #define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
-#define HAS_ARGBTOBAYERGGROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
@@ -95,7 +96,8 @@ extern "C" {
 #define HAS_I422TOUYVYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I444TOARGBROW_SSSE3
-// #define HAS_J422TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #define HAS_MIRRORROW_SSSE3
@@ -112,15 +114,13 @@ extern "C" {
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
-#define HAS_SETROW_X86
 #define HAS_SETROW_ERMS
-#define HAS_ARGBSETROW_X86
+#define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
 #define HAS_UYVYTOARGBROW_SSSE3
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
-#define HAS_YTOARGBROW_SSE2
 #define HAS_YUY2TOARGBROW_SSSE3
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
@@ -157,8 +157,9 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 #endif
-// The following are available on x64 Visual C:
-#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+    (!defined(__clang__) || defined(__SSSE3__))
 #define HAS_I422TOARGBROW_SSSE3
 #endif
@@ -177,27 +178,31 @@ extern "C" {
 #endif  // __clang__
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 // The following are available require VS2012. Port to GCC.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
-// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
-#define HAS_I422TOABGRROW_AVX2
-#define HAS_I422TOARGBROW_AVX2
-#define HAS_I422TOBGRAROW_AVX2
-#define HAS_I422TORGBAROW_AVX2
-#define HAS_NV12TOARGBROW_AVX2
-#define HAS_NV21TOARGBROW_AVX2
-#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
 #define HAS_ARGBTOARGB1555ROW_AVX2
 #define HAS_ARGBTOARGB4444ROW_AVX2
-#define HAS_NV12TORGB565ROW_AVX2
-#define HAS_NV21TORGB565ROW_AVX2
-#define HAS_I422TORGB565ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
 #endif
 // The following are available on all x86 platforms, but
@@ -214,24 +219,27 @@ extern "C" {
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_COPYROW_AVX
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
 #define HAS_UYVYTOUVROW_AVX2
 #define HAS_UYVYTOYROW_AVX2
-#define HAS_YTOARGBROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
-// The following require HAS_I422TOARGBROW_AVX2
-#if defined(HAS_I422TOARGBROW_AVX2)
-#define HAS_YUY2TOARGBROW_AVX2
-#define HAS_UYVYTOARGBROW_AVX2
-#endif
 // Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
@@ -240,22 +248,6 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif
-// The following are Yasm x86 only:
-// TODO(fbarchard): Port AVX2 to inline.
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
-    (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__x86_64__) || defined(__i386__))
-#define HAS_MERGEUVROW_AVX2
-#define HAS_MERGEUVROW_MMX
-#define HAS_SPLITUVROW_AVX2
-#define HAS_SPLITUVROW_MMX
-#define HAS_UYVYTOYROW_AVX2
-#define HAS_UYVYTOYROW_MMX
-#define HAS_YUY2TOYROW_AVX2
-#define HAS_YUY2TOYROW_MMX
-#endif
 // The following are disabled when SSSE3 is available:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
@@ -278,7 +270,6 @@ extern "C" {
 #define HAS_ARGB4444TOYROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOBAYERGGROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
@@ -292,7 +283,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_COPYROW_NEON
-#define HAS_I400TOARGBROW_NEON
+#define HAS_J400TOARGBROW_NEON
 #define HAS_I411TOARGBROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -331,11 +322,12 @@ extern "C" {
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
-#define HAS_YTOARGBROW_NEON
+#define HAS_I400TOARGBROW_NEON
 #define HAS_YUY2TOARGBROW_NEON
 #define HAS_YUY2TOUV422ROW_NEON
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
 // Effects:
 #define HAS_ARGBADDROW_NEON
@@ -388,7 +380,6 @@ typedef __declspec(align(32)) int8 lvec8[32];
 typedef __declspec(align(32)) uint16 ulvec16[16];
 typedef __declspec(align(32)) uint32 ulvec32[8];
 typedef __declspec(align(32)) uint8 ulvec8[32];
 #elif defined(__GNUC__)
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
@@ -869,6 +860,11 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix);
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
@@ -884,12 +880,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                               int pix);
 void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                                 int pix);
 void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                                 int pix);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
 void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
@@ -905,6 +909,13 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+                             const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int pix);
 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -914,6 +925,8 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                const uint32 dither4, int width);
 void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -922,14 +935,13 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint8* dither8x8, int pix);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I444ToARGBRow_C(const uint8* src_y,
                      const uint8* src_u,
@@ -1038,6 +1050,11 @@ void I444ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1048,6 +1065,11 @@ void I411ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void NV12ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_argb,
@@ -1097,6 +1119,11 @@ void J422ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_v,
                          uint8* dst_argb,
                          int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I422ToBGRARow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -1147,11 +1174,21 @@ void I422ToRGB24Row_SSSE3(const uint8* src_y,
                           const uint8* src_v,
                           uint8* dst_rgb24,
                           int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
 void I422ToRAWRow_SSSE3(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_raw,
                         int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
 void I422ToARGBRow_Any_AVX2(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
@@ -1177,6 +1214,11 @@ void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1187,6 +1229,11 @@ void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_v,
                              uint8* dst_argb,
                              int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_uv,
                              uint8* dst_argb,
@@ -1231,6 +1278,16 @@ void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
 void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
                             uint8* dst_argb,
                             int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
@@ -1281,33 +1338,29 @@ void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
                               const uint8* src_v,
                               uint8* dst_argb,
                               int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
 void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
-void YToARGBRow_C(const uint8* src_y,
-                  uint8* dst_argb,
-                  int width);
-void YToARGBRow_SSE2(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_AVX2(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_NEON(const uint8* src_y,
-                     uint8* dst_argb,
-                     int width);
-void YToARGBRow_Any_SSE2(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
-void YToARGBRow_Any_AVX2(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
-void YToARGBRow_Any_NEON(const uint8* src_y,
-                         uint8* dst_argb,
-                         int width);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
@@ -1375,6 +1428,11 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int pix);
 void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1384,6 +1442,8 @@ void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint32 dither4, int width);
 void I444ToARGBRow_Any_NEON(const uint8* src_y,
                             const uint8* src_u,
@@ -1570,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int pix);
-void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
-                        uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
-                           uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
-void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
-                               uint32 /* selector */, int pix);
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
                      const uint8* src_v,
@@ -1770,6 +1819,18 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                              uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                         uint8* dst_argb, int width);
 void ARGBPolynomialRow_C(const uint8* src_argb,
                          uint8* dst_argb, const float* poly,


@@ -12,45 +12,66 @@
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 #include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
-#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
-    defined(TARGET_IPHONE_SIMULATOR)
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+    (defined(__i386__) && !defined(__SSE2__))
 #define LIBYUV_DISABLE_X86
 #endif
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+    defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_SCALEROWDOWN2_SSE2
-#define HAS_SCALEROWDOWN4_SSE2
-#define HAS_SCALEROWDOWN34_SSSE3
-#define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEADDROWS_SSE2
-#define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALECOLSUP2_SSE2
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
 #define HAS_SCALEARGBROWDOWN2_SSE2
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
-#define HAS_SCALEARGBCOLS_SSE2
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-#define HAS_SCALEARGBCOLSUP2_SSE2
-#define HAS_FIXEDDIV_X86
-#define HAS_FIXEDDIV1_X86
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSE2
+#endif
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
 #endif
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
 #define HAS_SCALEROWDOWN2_NEON
-#define HAS_SCALEROWDOWN4_NEON
 #define HAS_SCALEROWDOWN34_NEON
 #define HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 // The following are available on Mips platforms:
@@ -164,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
 void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int dst_width);
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height);
-void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
 void ScaleARGBRowDown2_C(const uint8* src_argb,
                          ptrdiff_t src_stride,
                          uint8* dst_argb, int dst_width);
@@ -194,16 +213,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
 void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);
-// Specialized scalers for x86.
 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);
 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
@@ -220,46 +251,124 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                       uint16* dst_ptr, int src_width,
-                       int src_height);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width);
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int x, int dx);
 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                        int dst_width, int x, int dx);
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width);
+// ARGB Column functions
 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                         int dst_width, int x, int dx);
 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx);
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int x, int dx);
-// Row functions.
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                                  int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+                            int dst_width, int x, int dx);
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_stepx,
                                uint8* dst_argb, int dst_width);
 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                   int src_stepx,
                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
@@ -267,7 +376,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 // Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);
@@ -302,6 +412,42 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1305
+#define LIBYUV_VERSION 1456
 #endif  // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@@ -37,7 +37,7 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
#define HAS_HASHDJB2_SSE41 #define HAS_HASHDJB2_SSE41
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
#if _MSC_VER >= 1700 #ifdef VISUALC_HAS_AVX2
#define HAS_HASHDJB2_AVX2 #define HAS_HASHDJB2_AVX2
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
#endif #endif
@@ -138,8 +138,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_SSE2 #define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif #endif
// Visual C 2012 required for AVX2.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 #ifdef VISUALC_HAS_AVX2
#define HAS_SUMSQUAREERROR_AVX2 #define HAS_SUMSQUAREERROR_AVX2
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
#endif #endif
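// For reference, the SSE4.1/AVX2 rows declared above vectorize the classic
// djb2 hash; a minimal scalar sketch of the seeded variant (illustrative name,
// not a declared function):
static uint32 HashDjb2_Sketch(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];  // djb2: multiply by 33, add next byte
  }
  return hash;
}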

View File

@@ -32,7 +32,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
"ld1 {v0.16b}, [%0], #16 \n" "ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n" "subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n" "usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n" "usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n" "smlal v16.4s, v2.4h, v2.4h \n"

View File

@@ -16,9 +16,11 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
@@ -59,7 +61,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752) #pragma warning(disable: 4752)
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
__asm { __asm {
mov eax, [esp + 4] // src_a mov eax, [esp + 4] // src_a
@@ -133,7 +135,7 @@ static uvec32 kHashMul3 = {
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
_asm _emit 0x40 _asm _emit reg _asm _emit 0x40 _asm _emit reg
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
@@ -184,7 +186,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
// Visual C 2012 required for AVX2. // Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700 #if _MSC_VER >= 1700
__declspec(naked) __declspec(align(16)) __declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
__asm { __asm {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
@@ -219,8 +221,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
} }
} }
#endif // _MSC_VER >= 1700 #endif // _MSC_VER >= 1700
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"

View File

@@ -817,22 +817,20 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = -src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24;
} }
// Neon version does direct RGB24 to YUV.
#if defined(HAS_RGB24TOYROW_NEON) #if defined(HAS_RGB24TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
RGB24ToYRow = RGB24ToYRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON; RGB24ToYRow = RGB24ToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
} }
} }
#endif // Other platforms do intermediate conversion from RGB24 to ARGB.
#if defined(HAS_RGB24TOUVROW_NEON) #else
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3) #if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -841,27 +839,29 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_RGB24TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -894,8 +894,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
} }
#if !defined(HAS_RGB24TOYROW_NEON) #if !defined(HAS_RGB24TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
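// On the non-NEON path the loop body (not shown in this hunk) has the same
// shape as the RAW version below: a pair of RGB24 rows is converted to ARGB in
// the scratch buffer, then one subsampled UV row is derived from the pair and
// one Y row from each source row. Illustrative sketch of one iteration, using
// the row functions selected above:
//   RGB24ToARGBRow(src_rgb24, row, width);
//   RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
//   ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);  // 2x2 subsampled U/V
//   ARGBToYRow(row, dst_y, width);
//   ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);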
@@ -931,22 +931,20 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
src_stride_raw = -src_stride_raw; src_stride_raw = -src_stride_raw;
} }
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
RAWToYRow = RAWToYRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON; RAWToYRow = RAWToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
} }
} }
#endif // Other platforms do intermediate conversion from RAW to ARGB.
#if defined(HAS_RAWTOUVROW_NEON) #else
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_SSSE3) #if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3; RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -955,59 +953,63 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else #else
RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw, row, width);
RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width); ARGBToYRow(row, dst_y, width);
ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
#endif #endif
src_raw += src_stride_raw * 2; src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2; dst_y += dst_stride_y * 2;
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
if (height & 1) { if (height & 1) {
#if defined(HAS_RAWTOYROW_NEON) #if defined(HAS_RAWTOYROW_NEON)
RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw, dst_y, width);
#else #else
RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw, row, width);
ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToUVRow(row, 0, dst_u, dst_v, width);
ARGBToYRow(row, dst_y, width); ARGBToYRow(row, dst_y, width);
#endif #endif
} }
#if !defined(HAS_RAWTOYROW_NEON) #if !defined(HAS_RAWTOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
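// The scratch-row size moves from 16-byte to 32-byte rounding so a full AVX2
// store never runs past the buffer. Worked example of (width * 4 + 31) & ~31:
// for width = 100, 100 * 4 = 400 bytes, 400 + 31 = 431, and 431 & ~31 = 416,
// i.e. the row is padded up to the next multiple of 32 bytes.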
@@ -1043,19 +1045,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = -src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565;
} }
// Neon version does direct RGB565 to YUV.
#if defined(HAS_RGB565TOYROW_NEON) #if defined(HAS_RGB565TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
RGB565ToYRow = RGB565ToYRow_Any_NEON; RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON; RGB565ToYRow = RGB565ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
RGB565ToUVRow = RGB565ToUVRow_Any_NEON; RGB565ToUVRow = RGB565ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
RGB565ToUVRow = RGB565ToUVRow_NEON;
} }
} }
#else // HAS_RGB565TOYROW_NEON // Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2) #if defined(HAS_RGB565TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
@@ -1064,28 +1067,37 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_RGB565TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_RGB565TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -1118,8 +1130,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
} }
#if !defined(HAS_RGB565TOYROW_NEON) #if !defined(HAS_RGB565TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
@@ -1155,19 +1167,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = -src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555;
} }
// Neon version does direct ARGB1555 to YUV.
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON; ARGB1555ToYRow = ARGB1555ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
} }
} }
#else // HAS_ARGB1555TOYROW_NEON // Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2) #if defined(HAS_ARGB1555TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
@@ -1176,30 +1189,40 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_ARGB1555TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_ARGB1555TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_ARGB1555TOYROW_NEON) #if defined(HAS_ARGB1555TOYROW_NEON)
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
@@ -1230,9 +1253,9 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
#endif #endif
} }
#if !defined(HAS_ARGB1555TOYROW_NEON) #if !defined(HAS_ARGB1555TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }
@@ -1268,19 +1291,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = -src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444;
} }
// Neon version does direct ARGB4444 to YUV.
#if defined(HAS_ARGB4444TOYROW_NEON) #if defined(HAS_ARGB4444TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON; ARGB4444ToYRow = ARGB4444ToYRow_NEON;
} if (IS_ALIGNED(width, 16)) {
ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
if (IS_ALIGNED(width, 16)) { }
ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
} }
} }
#else // HAS_ARGB4444TOYROW_NEON // Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2) #if defined(HAS_ARGB4444TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
@@ -1289,28 +1313,37 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3; ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOUVROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif
#endif // HAS_ARGB4444TOYROW_NEON #if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
{ {
#if !defined(HAS_ARGB4444TOYROW_NEON)
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 15) & ~15; const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
#endif #endif
@@ -1345,8 +1378,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
} }
#if !defined(HAS_ARGB4444TOYROW_NEON) #if !defined(HAS_ARGB4444TOYROW_NEON)
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
#endif
} }
#endif
return 0; return 0;
} }

View File

@@ -85,6 +85,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I444ToARGBRow = I444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I444ToARGBRow = I444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I444TOARGBROW_NEON) #if defined(HAS_I444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON; I444ToARGBRow = I444ToARGBRow_Any_NEON;
@@ -222,6 +230,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I411TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I411ToARGBRow = I411ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I411ToARGBRow = I411ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I411TOARGBROW_NEON) #if defined(HAS_I411TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON; I411ToARGBRow = I411ToARGBRow_Any_NEON;
@@ -243,13 +259,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
// Convert I400 to ARGB. // Convert I400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*YToARGBRow)(const uint8* y_buf, void (*I400ToARGBRow)(const uint8* y_buf,
uint8* rgb_buf, uint8* rgb_buf,
int width) = YToARGBRow_C; int width) = I400ToARGBRow_C;
if (!src_y || !dst_argb || if (!src_y || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@@ -267,47 +283,47 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
height = 1; height = 1;
src_stride_y = dst_stride_argb = 0; src_stride_y = dst_stride_argb = 0;
} }
#if defined(HAS_YTOARGBROW_SSE2) #if defined(HAS_I400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2; I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2; I400ToARGBRow = I400ToARGBRow_SSE2;
} }
} }
#endif #endif
#if defined(HAS_YTOARGBROW_AVX2) #if defined(HAS_I400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
YToARGBRow = YToARGBRow_Any_AVX2; I400ToARGBRow = I400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
YToARGBRow = YToARGBRow_AVX2; I400ToARGBRow = I400ToARGBRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_YTOARGBROW_NEON) #if defined(HAS_I400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON; I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON; I400ToARGBRow = I400ToARGBRow_NEON;
} }
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
YToARGBRow(src_y, dst_argb, width); I400ToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
src_y += src_stride_y; src_y += src_stride_y;
} }
return 0; return 0;
} }
// Convert I400 to ARGB. // Convert J400 to ARGB.
LIBYUV_API LIBYUV_API
int I400ToARGB(const uint8* src_y, int src_stride_y, int J400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C; J400ToARGBRow_C;
if (!src_y || !dst_argb || if (!src_y || !dst_argb ||
width <= 0 || height == 0) { width <= 0 || height == 0) {
return -1; return -1;
@@ -325,24 +341,32 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
height = 1; height = 1;
src_stride_y = dst_stride_argb = 0; src_stride_y = dst_stride_argb = 0;
} }
#if defined(HAS_I400TOARGBROW_SSE2) #if defined(HAS_J400TOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2; J400ToARGBRow = J400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_SSE2; J400ToARGBRow = J400ToARGBRow_SSE2;
} }
} }
#endif #endif
#if defined(HAS_I400TOARGBROW_NEON) #if defined(HAS_J400TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
J400ToARGBRow = J400ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
J400ToARGBRow = J400ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_J400TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON; J400ToARGBRow = J400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON; J400ToARGBRow = J400ToARGBRow_NEON;
} }
} }
#endif #endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width); J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y; src_y += src_stride_y;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
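// J400 is full-range (JPEG) grey, so each output pixel simply replicates the
// grey byte into B, G and R with opaque alpha, whereas I400ToARGB above applies
// the limited-range Y expansion. A minimal scalar sketch of the J400 row
// (illustrative; intended to match the behavior selected as J400ToARGBRow_C):
static void J400ToARGBRow_Sketch(const uint8* src_y, uint8* dst_argb,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8 y = src_y[x];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 255u;  // A
    dst_argb += 4;
  }
}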
@@ -552,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
} }
} }
#endif #endif
#if defined(HAS_RGB565TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB565TOARGBROW_NEON) #if defined(HAS_RGB565TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
@@ -602,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
} }
} }
#endif #endif
#if defined(HAS_ARGB1555TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB1555TOARGBROW_NEON) #if defined(HAS_ARGB1555TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
@@ -652,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
} }
} }
#endif #endif
#if defined(HAS_ARGB4444TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_ARGB4444TOARGBROW_NEON) #if defined(HAS_ARGB4444TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;

View File

@@ -739,6 +739,14 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_AVX2;
}
}
#endif
#if defined(HAS_I422TORGB24ROW_NEON) #if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON; I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
@@ -791,6 +799,14 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
} }
} }
#endif #endif
#if defined(HAS_I422TORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRAWRow = I422ToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_I422TORAWROW_NEON) #if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON; I422ToRAWRow = I422ToRAWRow_Any_NEON;
@@ -993,6 +1009,117 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
static const uint8 kDither565_4x4[16] = {
0, 4, 1, 5,
6, 2, 7, 3,
1, 5, 0, 4,
7, 3, 6, 2,
};
// Convert I420 to RGB565 with dithering.
LIBYUV_API
int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither4x4, int width, int height) {
int y;
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
dst_stride_rgb565 = -dst_stride_rgb565;
}
if (!dither4x4) {
dither4x4 = kDither565_4x4;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
{
// Allocate a row of argb.
align_buffer_64(row_argb, width * 4);
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
*(uint32*)(dither4x4 + ((y & 3) << 2)), width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
src_v += src_stride_v;
}
}
free_aligned_buffer_64(row_argb);
}
return 0;
}
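// Each scanline uses one 4-byte row of kDither565_4x4, selected by (y & 3) and
// passed to the row function as a packed uint32. Per pixel, a dither value of
// 0..7 is added to each 8-bit channel before truncation to 5/6/5 bits, which
// spreads the quantization error spatially. Illustrative per-pixel sketch
// (clamping helper assumed; not the exact row implementation):
//   int d = (dither4 >> ((x & 3) * 8)) & 0xff;  // 0..7 from the packed row
//   int b = clamp255(src_argb[0] + d) >> 3;     // 5 bits
//   int g = clamp255(src_argb[1] + d) >> 2;     // 6 bits
//   int r = clamp255(src_argb[2] + d) >> 3;     // 5 bits
//   *(uint16*)dst_rgb = (uint16)(b | (g << 5) | (r << 11));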
// Convert I420 to specified format // Convert I420 to specified format
LIBYUV_API LIBYUV_API
int ConvertFromI420(const uint8* y, int y_stride, int ConvertFromI420(const uint8* y, int y_stride,

View File

@@ -72,7 +72,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
ARGBToYRow = ARGBToYRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3;
} }
} }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
@@ -139,7 +146,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -148,6 +154,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -275,6 +289,16 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -317,8 +341,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// Allocate 2 rows of uv. // Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15); uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -374,6 +398,16 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -416,8 +450,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// Allocate 2 rows of uv. // Allocate 2 rows of uv.
align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8* row_v = row_u + ((halfwidth + 15) & ~15); uint8* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -492,6 +526,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -591,6 +633,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) #if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON; ARGBToYRow = ARGBToYRow_Any_NEON;
@@ -804,25 +854,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
static const uint8 kDither8x8[64] = { // Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
0, 128, 32, 160, 8, 136, 40, 168, static const uint8 kDither565_4x4[16] = {
192, 64, 224, 96, 200, 72, 232, 104, 0, 4, 1, 5,
48, 176, 16, 144, 56, 184, 24, 152, 6, 2, 7, 3,
240, 112, 208, 80, 248, 120, 216, 88, 1, 5, 0, 4,
12, 140, 44, 172, 4, 132, 36, 164, 7, 3, 6, 2,
204, 76, 236, 108, 196, 68, 228, 100,
60, 188, 28, 156, 52, 180, 20, 148,
252, 124, 220, 92, 244, 116, 212, 84,
}; };
// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API LIBYUV_API
int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
const uint8* dither8x8, int width, int height) { const uint8* dither4x4, int width, int height) {
int y; int y;
void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C; const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1; return -1;
} }
@@ -831,13 +878,36 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
if (!dither8x8) { if (!dither4x4) {
dither8x8 = kDither8x8; dither4x4 = kDither565_4x4;
} }
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565, ARGBToRGB565DitherRow(src_argb, dst_rgb565,
dither8x8 + ((y & 7) << 3), width); *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565; dst_rgb565 += dst_stride_rgb565;
} }
@@ -845,6 +915,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
} }
// Convert ARGB To RGB565. // Convert ARGB To RGB565.
// TODO(fbarchard): Consider using dither function low level with zeros.
LIBYUV_API LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565, uint8* dst_rgb565, int dst_stride_rgb565,
@@ -1021,7 +1092,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
int width, int height) { int width, int height) {
int y; int y;
void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C; ARGBToYJRow_C;
if (!src_argb || if (!src_argb ||
@@ -1045,7 +1116,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) #if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2; ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@@ -1140,6 +1211,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON) #if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON; ARGBToYJRow = ARGBToYJRow_Any_NEON;

View File

@@ -10,13 +10,12 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#if defined(_MSC_VER) && !defined(__clang__) #if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
#include <intrin.h> // For __cpuidex() #include <intrin.h> // For __cpuidex()
#endif #endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if !defined(__pnacl__) && !defined(__CLR_VER) && \
!defined(__native_client__) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \ defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
(defined(_M_IX86) || defined(_M_X64))
#include <immintrin.h> // For _xgetbv() #include <immintrin.h> // For _xgetbv()
#endif #endif
@@ -37,23 +36,23 @@ extern "C" {
// For functions that use the stack and have runtime checks for overflow, // For functions that use the stack and have runtime checks for overflow,
// use SAFEBUFFERS to avoid additional check. // use SAFEBUFFERS to avoid additional check.
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers) #define SAFEBUFFERS __declspec(safebuffers)
#else #else
#define SAFEBUFFERS #define SAFEBUFFERS
#endif #endif
// Low level cpuid for X86. Returns zeros on other CPUs. // Low level cpuid for X86.
#if !defined(__pnacl__) && !defined(__CLR_VER) && \ #if (defined(_M_IX86) || defined(_M_X64) || \
(defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) && \
defined(__i386__) || defined(__x86_64__)) !defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__) #if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
// Visual C version uses intrinsic or inline x86 assembly.
#if (_MSC_FULL_VER >= 160040219) #if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx); __cpuidex((int*)(cpu_info), info_eax, info_ecx);
#endif #elif defined(_M_IX86)
#if defined(_M_IX86)
__asm { __asm {
mov eax, info_eax mov eax, info_eax
mov ecx, info_ecx mov ecx, info_ecx
@@ -71,7 +70,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
} }
#endif #endif
#else // defined(_MSC_VER) // GCC version uses inline x86 assembly.
#else // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
uint32 info_ebx, info_edx; uint32 info_ebx, info_edx;
asm volatile ( // NOLINT asm volatile ( // NOLINT
#if defined( __i386__) && defined(__PIC__) #if defined( __i386__) && defined(__PIC__)
@@ -89,37 +89,38 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
cpu_info[1] = info_ebx; cpu_info[1] = info_ebx;
cpu_info[2] = info_ecx; cpu_info[2] = info_ecx;
cpu_info[3] = info_edx; cpu_info[3] = info_edx;
#endif // defined(_MSC_VER) #endif // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
} }
#else // (defined(_M_IX86) || defined(_M_X64) ...
#if !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#endif
#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#endif
#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // !defined(__native_client__)
#else
LIBYUV_API LIBYUV_API
void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
} }
#endif #endif
// TODO(fbarchard): Enable xgetbv when validator supports it.
#if (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
#define HAS_XGETBV
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
return((xcr0 & 6) == 6); // Is ymm saved?
}
#endif // defined(_M_IX86) || defined(_M_X64) ..
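// The AVX2 paths are only enabled when three independent checks pass: CPUID
// leaf 1 reports AVX and OSXSAVE, TestOsSaveYmm above confirms the OS saves
// the ymm state, and CPUID leaf 7 reports AVX2. Illustrative sketch of how
// the pieces combine (CpuId and TestOsSaveYmm as defined in this file; bit
// positions are the architectural ones):
//   uint32 cpu_info1[4], cpu_info7[4];
//   CpuId(1, 0, cpu_info1);
//   CpuId(7, 0, cpu_info7);
//   int has_avx = (cpu_info1[2] & 0x10000000) &&  // ECX bit 28: AVX
//                 (cpu_info1[2] & 0x08000000) &&  // ECX bit 27: OSXSAVE
//                 TestOsSaveYmm();                // XCR0 bits 1 and 2 set
//   int has_avx2 = has_avx && (cpu_info7[1] & 0x00000020);  // EBX bit 5: AVX2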
// based on libvpx arm_cpudetect.c // based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU // For Arm, but public to allow testing on any CPU
LIBYUV_API SAFEBUFFERS LIBYUV_API SAFEBUFFERS

View File

@@ -18,6 +18,12 @@
// Must be included before jpeglib. // Must be included before jpeglib.
#include <setjmp.h> #include <setjmp.h>
#define HAVE_SETJMP #define HAVE_SETJMP
#if defined(_MSC_VER)
// disable warning 4324: structure was padded due to __declspec(align())
#pragma warning(disable:4324)
#endif
#endif #endif
struct FILE; // For jpeglib.h. struct FILE; // For jpeglib.h.

View File

@@ -23,7 +23,7 @@ extern "C" {
#ifdef ENABLE_SCASB #ifdef ENABLE_SCASB
// Multiple of 1. // Multiple of 1.
__declspec(naked) __declspec(align(16)) __declspec(naked)
const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
__asm { __asm {
mov edx, edi mov edx, edi

View File

@@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
return 0; return 0;
} }
// Get a blender that optimized for the CPU, alignment and pixel count. // Get a blender that optimized for the CPU and pixel count.
// As there are 6 blenders to choose from, the caller should try to use // As there are 6 blenders to choose from, the caller should try to use
// the same blend function for all pixels if possible. // the same blend function for all pixels if possible.
LIBYUV_API LIBYUV_API
@@ -677,12 +677,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
height = 1; height = 1;
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
} }
#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) #if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_SSE2; ARGBAddRow = ARGBAddRow_SSE2;
} }
#endif #endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) #if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2; ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) { if (IS_ALIGNED(width, 4)) {
@@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
const uint8* src_sobely, const uint8* src_sobely,
uint8* dst, int width)) { uint8* dst, int width)) {
int y; int y;
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
uint32 selector, int pix) = ARGBToBayerGGRow_C; ARGBToYJRow_C;
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C; uint8* dst_sobely, int width) = SobelYRow_C;
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
@@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb; src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
// ARGBToBayer used to select G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2) #if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
}
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 16)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON) #if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON; ARGBToYJRow = ARGBToYJRow_NEON;
} }
} }
#endif #endif
#if defined(HAS_SOBELYROW_SSE2) #if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelYRow = SobelYRow_SSE2; SobelYRow = SobelYRow_SSE2;
@@ -2040,7 +2041,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
#endif #endif
{ {
// 3 rows with edges before/after. // 3 rows with edges before/after.
const int kRowSize = (width + kEdge + 15) & ~15; const int kRowSize = (width + kEdge + 31) & ~31;
align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
uint8* row_sobelx = rows; uint8* row_sobelx = rows;
uint8* row_sobely = rows + kRowSize; uint8* row_sobely = rows + kRowSize;
@@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
uint8* row_y0 = row_y + kEdge; uint8* row_y0 = row_y + kEdge;
uint8* row_y1 = row_y0 + kRowSize; uint8* row_y1 = row_y0 + kRowSize;
uint8* row_y2 = row_y1 + kRowSize; uint8* row_y2 = row_y1 + kRowSize;
ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0]; row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); ARGBToYJRow(src_argb, row_y1, width);
row_y1[-1] = row_y1[0]; row_y1[-1] = row_y1[0];
memset(row_y1 + width, row_y1[width - 1], 16); memset(row_y1 + width, row_y1[width - 1], 16);
memset(row_y2 + width, 0, 16); memset(row_y2 + width, 0, 16);
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
// Convert next row of ARGB to Y. // Convert next row of ARGB to G.
if (y < (height - 1)) { if (y < (height - 1)) {
src_argb += src_stride_argb; src_argb += src_stride_argb;
} }
ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); ARGBToYJRow(src_argb, row_y2, width);
row_y2[-1] = row_y2[0]; row_y2[-1] = row_y2[0];
row_y2[width] = row_y2[width - 1]; row_y2[width] = row_y2[width - 1];
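// The loop keeps three luma rows (row_y0/row_y1/row_y2, padded one pixel on
// each side), and the SobelX/SobelY row functions apply the standard 3x3
// kernels to them. Illustrative per-pixel sketch over a neighborhood
// p[row][col] centered on the current pixel (not the exact row signatures):
//   int gx = (p[0][2] + 2 * p[1][2] + p[2][2])
//          - (p[0][0] + 2 * p[1][0] + p[2][0]);   // horizontal gradient
//   int gy = (p[2][0] + 2 * p[2][1] + p[2][2])
//          - (p[0][0] + 2 * p[0][1] + p[0][2]);   // vertical gradient
//   int sobel = abs(gx) + abs(gy);
//   if (sobel > 255) sobel = 255;  // SobelRow packs this value into ARGB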
@@ -2094,13 +2095,19 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C; uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2) #if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelRow = SobelRow_SSE2; SobelRow = SobelRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELROW_NEON) #if defined(HAS_SOBELROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelRow = SobelRow_NEON; SobelRow = SobelRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelRow = SobelRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2115,13 +2122,19 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C; uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2) #if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2; SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELTOPLANEROW_NEON) #if defined(HAS_SOBELTOPLANEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelToPlaneRow = SobelToPlaneRow_NEON; SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
@@ -2137,13 +2150,19 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C; uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2) #if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
SobelXYRow = SobelXYRow_SSE2; SobelXYRow = SobelXYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
} }
#endif #endif
#if defined(HAS_SOBELXYROW_NEON) #if defined(HAS_SOBELXYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
SobelXYRow = SobelXYRow_NEON; SobelXYRow = SobelXYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SobelXYRow = SobelXYRow_NEON;
}
} }
#endif #endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
@@ -2322,6 +2341,214 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
return 0; return 0;
} }
LIBYUV_API
int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_yuy2 ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, rows, awidth);
SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
rows + awidth, awidth);
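// Average the two split UV rows (source_y_fraction 128, i.e. a 50/50 blend)
// into a single half-height NV12 chroma row.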
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_yuy2, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
LIBYUV_API
int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
int width, int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_uyvy ||
!dst_y || !dst_uv ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_SPLITUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
{
int awidth = halfwidth * 2;
// 2 rows of uv
align_buffer_64(rows, awidth * 2);
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
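// In UYVY the chroma byte comes first in each pair, so the split sends the
// interleaved UV bytes to the row buffer and the Y bytes to dst_y (the
// reverse argument order from the YUY2 path above).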
SplitUVRow(src_uyvy, rows, dst_y, awidth);
SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
dst_y + dst_stride_y, awidth);
InterpolateRow(dst_uv, rows, awidth, awidth, 128);
src_uyvy += src_stride_uyvy * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_uyvy, dst_y, dst_uv, width);
}
free_aligned_buffer_64(rows);
}
return 0;
}
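Not part of the patch: a minimal caller sketch for the new packed-to-NV12 entry points, assuming the prototypes above are exposed through the usual libyuv umbrella header; the function and buffer names are illustrative only.
// Hypothetical caller (sketch only): convert one YUY2 frame to NV12.
#include "libyuv.h"  // umbrella header for the libyuv API
#include <vector>
void Yuy2FrameToNV12(const uint8* src_yuy2, int width, int height) {
  const int halfwidth = (width + 1) >> 1;
  const int halfheight = (height + 1) >> 1;
  std::vector<uint8> y(width * height);               // full-size luma plane
  std::vector<uint8> uv(halfwidth * 2 * halfheight);  // interleaved U/V plane
  // YUY2 is 2 bytes per pixel; NV12 is a full-size Y plane plus a
  // half-height interleaved UV plane.
  libyuv::YUY2ToNV12(src_yuy2, width * 2,
                     y.data(), width,
                     uv.data(), halfwidth * 2,
                     width, height);
}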
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@@ -13,6 +13,7 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/convert.h" #include "libyuv/convert.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate_row.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#ifdef __cplusplus #ifdef __cplusplus
@@ -20,809 +21,39 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".private_extern _" #name " \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
".text \n" \
".align 4,0x90 \n" \
#name ": \n"
#endif
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif // defined(__mips__)
#if !defined(LIBYUV_DISABLE_X86) && \
defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(w) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
LIBYUV_API LIBYUV_API
void TransposePlane(const uint8* src, int src_stride, void TransposePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride,
int width, int height) { int width, int height) {
int i = height; int i = height;
void (*TransposeWx8)(const uint8* src, int src_stride, void (*TransposeWx8)(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) = TransposeWx8_C;
int width) = TransposeWx8_C; #if defined(HAS_TRANSPOSEWX8_NEON)
#if defined(HAS_TRANSPOSE_WX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON; TransposeWx8 = TransposeWx8_NEON;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3) #if defined(HAS_TRANSPOSEWX8_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_SSSE3; TransposeWx8 = TransposeWx8_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_SSSE3;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3; TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_Fast_SSSE3;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) && if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
} else { } else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2; TransposeWx8 = TransposeWx8_MIPS_DSPR2;
} }
@@ -837,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride,
i -= 8; i -= 8;
} }
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); if (i > 0) {
TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
} }
LIBYUV_API LIBYUV_API
@@ -955,48 +188,6 @@ void RotatePlane180(const uint8* src, int src_stride,
free_aligned_buffer_64(row); free_aligned_buffer_64(row);
} }
static void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
static void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
LIBYUV_API LIBYUV_API
void TransposeUV(const uint8* src, int src_stride, void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
@@ -1007,17 +198,17 @@ void TransposeUV(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a, uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, uint8* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C; int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON) #if defined(HAS_TRANSPOSEUVWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON; TransposeUVWx8 = TransposeUVWx8_NEON;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2) #if defined(HAS_TRANSPOSEUVWX8_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2; TransposeUVWx8 = TransposeUVWx8_SSE2;
} }
#endif #endif
#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) #if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@@ -1036,10 +227,12 @@ void TransposeUV(const uint8* src, int src_stride,
i -= 8; i -= 8;
} }
TransposeUVWxH_C(src, src_stride, if (i > 0) {
dst_a, dst_stride_a, TransposeUVWxH_C(src, src_stride,
dst_b, dst_stride_b, dst_a, dst_stride_a,
width, i); dst_b, dst_stride_b,
width, i);
}
} }
LIBYUV_API LIBYUV_API

third_party/libyuv/source/rotate_any.cc vendored Normal file
View File

@@ -0,0 +1,55 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/rotate.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
void NAMEANY(const uint8* src, int src_stride, \
uint8* dst, int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
} \
TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
}
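For reference, a hand expansion of the NEON instantiation below; this is only the macro written out, not additional patch content:
// TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) expands to:
void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride, int width) {
  int r = width & 7;   // columns left over after the multiple-of-8 portion
  int n = width - r;
  if (n > 0) {
    TransposeWx8_NEON(src, src_stride, dst, dst_stride, n);  // SIMD bulk
  }
  // Remainder: advance n source columns right and n transposed rows down.
  TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}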
#ifdef HAS_TRANSPOSEWX8_NEON
TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
#endif
#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
#endif
#undef TANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -27,24 +27,20 @@ extern "C" {
(defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
#endif #endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON #define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
#endif #endif
void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
int src_stepx, int src_stepx, uint8* dst_ptr, int dst_width);
uint8* dst_ptr, int dst_width);
static void ARGBTranspose(const uint8* src, int src_stride, static void ARGBTranspose(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
int i; int i;
int src_pixel_step = src_stride >> 2; int src_pixel_step = src_stride >> 2;
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
@@ -68,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride,
} }
void ARGBRotate90(const uint8* src, int src_stride, void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Rotate by 90 is a ARGBTranspose with the source read // Rotate by 90 is a ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end // from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride. // of the buffer and flip the sign of the source stride.
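A sketch of the call the comment describes; the function body falls outside this hunk, so the lines below are a reconstruction rather than patch content:
// Start at the last source row and walk upward by negating the stride,
// then transpose into the destination.
ARGBTranspose(src + src_stride * (height - 1), -src_stride,
              dst, dst_stride, width, height);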
@@ -79,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
} }
void ARGBRotate270(const uint8* src, int src_stride, void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Rotate by 270 is a ARGBTranspose with the destination written // Rotate by 270 is a ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end // from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride. // of the buffer and flip the sign of the destination stride.
@@ -90,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
} }
void ARGBRotate180(const uint8* src, int src_stride, void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width, int height) {
int width, int height) {
// Swap first and last row and mirror the content. Uses a temporary row. // Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4); align_buffer_64(row, width * 4);
const uint8* src_bot = src + src_stride * (height - 1); const uint8* src_bot = src + src_stride * (height - 1);
@@ -166,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
LIBYUV_API LIBYUV_API
int ARGBRotate(const uint8* src_argb, int src_stride_argb, int ARGBRotate(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height,
int width, int height,
enum RotationMode mode) { enum RotationMode mode) {
if (!src_argb || width <= 0 || height == 0 || !dst_argb) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
return -1; return -1;

View File

@@ -0,0 +1,92 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
int i;
for (i = 0; i < width; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
dst[3] = src[3 * src_stride];
dst[4] = src[4 * src_stride];
dst[5] = src[5 * src_stride];
dst[6] = src[6 * src_stride];
dst[7] = src[7 * src_stride];
++src;
dst += dst_stride;
}
}
void TransposeUVWx8_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
int i;
for (i = 0; i < width; ++i) {
dst_a[0] = src[0 * src_stride + 0];
dst_b[0] = src[0 * src_stride + 1];
dst_a[1] = src[1 * src_stride + 0];
dst_b[1] = src[1 * src_stride + 1];
dst_a[2] = src[2 * src_stride + 0];
dst_b[2] = src[2 * src_stride + 1];
dst_a[3] = src[3 * src_stride + 0];
dst_b[3] = src[3 * src_stride + 1];
dst_a[4] = src[4 * src_stride + 0];
dst_b[4] = src[4 * src_stride + 1];
dst_a[5] = src[5 * src_stride + 0];
dst_b[5] = src[5 * src_stride + 1];
dst_a[6] = src[6 * src_stride + 0];
dst_b[6] = src[6 * src_stride + 1];
dst_a[7] = src[7 * src_stride + 0];
dst_b[7] = src[7 * src_stride + 1];
src += 2;
dst_a += dst_stride_a;
dst_b += dst_stride_b;
}
}
void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i;
for (i = 0; i < width; ++i) {
int j;
for (j = 0; j < height; ++j) {
dst[i * dst_stride + j] = src[j * src_stride + i];
}
}
}
void TransposeUVWxH_C(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width, int height) {
int i;
for (i = 0; i < width * 2; i += 2) {
int j;
for (j = 0; j < height; ++j) {
dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
}
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

third_party/libyuv/source/rotate_gcc.cc vendored Normal file
View File

@@ -0,0 +1,493 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width);
asm (
DECLARE_FUNCTION(TransposeUVWx8_SSE2)
"push %ebx \n"
"push %esi \n"
"push %edi \n"
"push %ebp \n"
"mov 0x14(%esp),%eax \n"
"mov 0x18(%esp),%edi \n"
"mov 0x1c(%esp),%edx \n"
"mov 0x20(%esp),%esi \n"
"mov 0x24(%esp),%ebx \n"
"mov 0x28(%esp),%ebp \n"
"mov %esp,%ecx \n"
"sub $0x14,%esp \n"
"and $0xfffffff0,%esp \n"
"mov %ecx,0x10(%esp) \n"
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
"movdqu (%eax),%xmm0 \n"
"movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
"movdqu (%eax),%xmm2 \n"
"movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
"movdqu (%eax),%xmm4 \n"
"movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
"movdqu (%eax),%xmm6 \n"
"movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
"punpckhbw %xmm7,%xmm5 \n"
"movdqa %xmm5,%xmm7 \n"
"lea 0x10(%eax,%edi,8),%eax \n"
"neg %edi \n"
"movdqa %xmm0,%xmm5 \n"
"punpcklwd %xmm2,%xmm0 \n"
"punpckhwd %xmm2,%xmm5 \n"
"movdqa %xmm5,%xmm2 \n"
"movdqa %xmm1,%xmm5 \n"
"punpcklwd %xmm3,%xmm1 \n"
"punpckhwd %xmm3,%xmm5 \n"
"movdqa %xmm5,%xmm3 \n"
"movdqa %xmm4,%xmm5 \n"
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
"movdqu (%esp),%xmm5 \n"
"movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
"movdqa %xmm6,%xmm7 \n"
"movdqa %xmm0,%xmm6 \n"
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
"movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm4,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm2,%xmm0 \n"
"punpckldq %xmm6,%xmm2 \n"
"movlpd %xmm2,(%edx) \n"
"movhpd %xmm2,(%ebx) \n"
"punpckhdq %xmm6,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm1,%xmm0 \n"
"punpckldq %xmm5,%xmm1 \n"
"movlpd %xmm1,(%edx) \n"
"movhpd %xmm1,(%ebx) \n"
"punpckhdq %xmm5,%xmm0 \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"movdqa %xmm3,%xmm0 \n"
"punpckldq %xmm7,%xmm3 \n"
"movlpd %xmm3,(%edx) \n"
"movhpd %xmm3,(%ebx) \n"
"punpckhdq %xmm7,%xmm0 \n"
"sub $0x8,%ecx \n"
"movlpd %xmm0,(%edx,%esi,1) \n"
"lea (%edx,%esi,2),%edx \n"
"movhpd %xmm0,(%ebx,%ebp,1) \n"
"lea (%ebx,%ebp,2),%ebx \n"
"jg 1b \n"
"mov 0x10(%esp),%esp \n"
"pop %ebp \n"
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
#if defined(__native_client__)
"pop %ecx \n"
"and $0xffffffe0,%ecx \n"
"jmp *%ecx \n"
#else
"ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
".p2align 2 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9"
);
}
#endif
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
@@ -22,8 +23,7 @@ extern "C" {
(_MIPS_SIM == _MIPS_SIM_ABI32) (_MIPS_SIM == _MIPS_SIM_ABI32)
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set push \n" ".set push \n"
".set noreorder \n" ".set noreorder \n"
@@ -106,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
); );
} }
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
__asm__ __volatile__ ( __asm__ __volatile__ (
".set noat \n" ".set noat \n"
".set push \n" ".set push \n"

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"

View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
@@ -21,11 +22,10 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
static uvec8 kVTbl4x4Transpose = static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride, void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, uint8* dst, int dst_stride, int width) {
int width) {
const uint8* src_temp = NULL; const uint8* src_temp = NULL;
int64 width64 = (int64) width; // Work around clang 3.4 warning. int64 width64 = (int64) width; // Work around clang 3.4 warning.
asm volatile ( asm volatile (

third_party/libyuv/source/rotate_win.cc vendored Normal file
View File

@@ -0,0 +1,248 @@
/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/rotate_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
__asm {
push edi
push esi
push ebp
mov eax, [esp + 12 + 4] // src
mov edi, [esp + 12 + 8] // src_stride
mov edx, [esp + 12 + 12] // dst
mov esi, [esp + 12 + 16] // dst_stride
mov ecx, [esp + 12 + 20] // width
// Read in the data from the source pointer.
// First round of bit swap.
align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
movq xmm1, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm0, xmm1
movq xmm2, qword ptr [eax]
movdqa xmm1, xmm0
palignr xmm1, xmm1, 8
movq xmm3, qword ptr [eax + edi]
lea eax, [eax + 2 * edi]
punpcklbw xmm2, xmm3
movdqa xmm3, xmm2
movq xmm4, qword ptr [eax]
palignr xmm3, xmm3, 8
movq xmm5, qword ptr [eax + edi]
punpcklbw xmm4, xmm5
lea eax, [eax + 2 * edi]
movdqa xmm5, xmm4
movq xmm6, qword ptr [eax]
palignr xmm5, xmm5, 8
movq xmm7, qword ptr [eax + edi]
punpcklbw xmm6, xmm7
mov eax, ebp
movdqa xmm7, xmm6
palignr xmm7, xmm7, 8
// Second round of bit swap.
punpcklwd xmm0, xmm2
punpcklwd xmm1, xmm3
movdqa xmm2, xmm0
movdqa xmm3, xmm1
palignr xmm2, xmm2, 8
palignr xmm3, xmm3, 8
punpcklwd xmm4, xmm6
punpcklwd xmm5, xmm7
movdqa xmm6, xmm4
movdqa xmm7, xmm5
palignr xmm6, xmm6, 8
palignr xmm7, xmm7, 8
// Third round of bit swap.
// Write to the destination pointer.
punpckldq xmm0, xmm4
movq qword ptr [edx], xmm0
movdqa xmm4, xmm0
palignr xmm4, xmm4, 8
movq qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
punpckldq xmm2, xmm6
movdqa xmm6, xmm2
palignr xmm6, xmm6, 8
movq qword ptr [edx], xmm2
punpckldq xmm1, xmm5
movq qword ptr [edx + esi], xmm6
lea edx, [edx + 2 * esi]
movdqa xmm5, xmm1
movq qword ptr [edx], xmm1
palignr xmm5, xmm5, 8
punpckldq xmm3, xmm7
movq qword ptr [edx + esi], xmm5
lea edx, [edx + 2 * esi]
movq qword ptr [edx], xmm3
movdqa xmm7, xmm3
palignr xmm7, xmm7, 8
sub ecx, 8
movq qword ptr [edx + esi], xmm7
lea edx, [edx + 2 * esi]
jg convertloop
pop ebp
pop esi
pop edi
ret
}
}
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int w) {
__asm {
push ebx
push esi
push edi
push ebp
mov eax, [esp + 16 + 4] // src
mov edi, [esp + 16 + 8] // src_stride
mov edx, [esp + 16 + 12] // dst_a
mov esi, [esp + 16 + 16] // dst_stride_a
mov ebx, [esp + 16 + 20] // dst_b
mov ebp, [esp + 16 + 24] // dst_stride_b
mov ecx, esp
sub esp, 4 + 16
and esp, ~15
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
movdqu xmm0, [eax]
movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
movdqu xmm2, [eax]
movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
movdqu xmm4, [eax]
movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
movdqu xmm6, [eax]
movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
punpckhbw xmm5, xmm7
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
movdqa xmm2, xmm5
movdqa xmm5, xmm1
punpcklwd xmm1, xmm3
punpckhwd xmm5, xmm3
movdqa xmm3, xmm5
movdqa xmm5, xmm4
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
movdqu xmm5, [esp] // restore xmm5
movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm4
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm2 // use xmm0 as the temp register.
punpckldq xmm2, xmm6
movlpd qword ptr [edx], xmm2
movhpd qword ptr [ebx], xmm2
punpckhdq xmm0, xmm6
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm1 // use xmm0 as the temp register.
punpckldq xmm1, xmm5
movlpd qword ptr [edx], xmm1
movhpd qword ptr [ebx], xmm1
punpckhdq xmm0, xmm5
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
movdqa xmm0, xmm3 // use xmm0 as the temp register.
punpckldq xmm3, xmm7
movlpd qword ptr [edx], xmm3
movhpd qword ptr [ebx], xmm3
punpckhdq xmm0, xmm7
sub ecx, 8
movlpd qword ptr [edx + esi], xmm0
lea edx, [edx + 2 * esi]
movhpd qword ptr [ebx + ebp], xmm0
lea ebx, [ebx + 2 * ebp]
jg convertloop
mov esp, [esp + 16]
pop ebp
pop edi
pop esi
pop ebx
ret
}
}
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

File diff suppressed because it is too large

View File

@@ -199,28 +199,36 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
} }
} }
// dither4 is a row of 4 values from a 4x4 dither matrix.
// The 4x4 matrix contains values that are added to R, G and B. When
// converting to fewer bits (565) this provides an ordered dither.
// The first byte of the 4x4 matrix corresponds to the upper-left pixel.
// The 4 values are passed as an int, then referenced as a byte array, so
// endianness does not affect the order of the original matrix. But dither4
// will contain the first pixel in the lower byte on little endian and in
// the upper byte on big endian.
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
const uint8* dither8x8, int width) { const uint32 dither4, int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
int dither0 = dither8x8[x & 7] - 128; int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = dither8x8[(x & 7) + 1] - 128; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
uint8 b1 = Clamp(src_argb[4] + dither1) >> 3; uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
uint8 g1 = Clamp(src_argb[5] + dither1) >> 2; uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
uint8 r1 = Clamp(src_argb[6] + dither1) >> 3; uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27)); (b1 << 16) | (g1 << 21) | (r1 << 27));
dst_rgb += 4; dst_rgb += 4;
src_argb += 8; src_argb += 8;
} }
if (width & 1) { if (width & 1) {
int dither0 = dither8x8[(width - 1) & 7] - 128; int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
*(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
} }
} }
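A minimal sketch of how a caller might build the dither4 argument from one row of a 4x4 matrix, following the byte-array packing described above (the helper name and the include are illustrative, not libyuv API):

#include <string.h>  /* memcpy */

/* Pack one row of a 4x4 ordered-dither matrix into a dither4 word.
 * Reading the word back byte-by-byte recovers the original order on
 * either endianness, which is what ARGBToRGB565DitherRow_C relies on. */
static uint32 MakeDither4(const uint8 matrix_row[4]) {
  uint32 dither4;
  memcpy(&dither4, matrix_row, 4);
  return dither4;
}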
@@ -974,7 +982,7 @@ void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
} }
} }
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB. // Copy a Y to RGB.
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
@@ -986,38 +994,42 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
} }
} }
// YUV to RGB conversion constants. // BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias. // Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header. // TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B. // U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */ #define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */ #define UG 25 /* round(0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */ #define VG 52 /* round(0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */ #define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V. // Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB) #define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 - YGB) #define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 - YGB) #define BR (VR * 128 + YGB)
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) { uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(BB - ( u * UB) + y1) >> 6); *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
*g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6); *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
*r = Clamp((int32)(BR - (v * VR ) + y1) >> 6); *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
} }
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
*b = Clamp((int32)(y1 - YGB) >> 6); *b = Clamp((int32)(y1 + YGB) >> 6);
*g = Clamp((int32)(y1 - YGB) >> 6); *g = Clamp((int32)(y1 + YGB) >> 6);
*r = Clamp((int32)(y1 - YGB) >> 6); *r = Clamp((int32)(y1 + YGB) >> 6);
} }
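A quick numeric check of the new fixed-point constants (not part of the source): with Y = 16, U = V = 128, y1 = (16 * 0x0101 * YG) >> 16 = 1191 and the B channel evaluates to (16384 + 1191 - 17544) >> 6 = 31 >> 6 = 0, with G and R reducing to the same value, i.e. studio black. With Y = 235, U = V = 128, y1 = 17506 and every channel becomes 16346 >> 6 = 255, i.e. full white, so the 16..235 studio range maps onto 0..255 as intended.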
#undef YG #undef YG
@@ -1030,6 +1042,46 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
#undef BG #undef BG
#undef BR #undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// C reference code that mimics the YUV assembly.
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
*b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
*g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
*r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
}
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
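A similar check for the full-range JPEG constants (not part of the source): with Y = 0, U = V = 128, every channel reduces to 32 >> 6 = 0; with Y = 255, U = V = 128, y1 = (255 * 0x0101 * YGJ) >> 16 = 16319 and every channel becomes 16351 >> 6 = 255. Unlike the BT.601 path there is no 16/235 offset, so the full 0..255 Y range is preserved.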
#if !defined(LIBYUV_DISABLE_NEON) && \ #if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly. // C mimic assembly.
@@ -1102,34 +1154,6 @@ void I422ToARGBRow_C(const uint8* src_y,
} }
} }
// C reference code that mimics the YUV assembly.
// * R = Y + 1.40200 * Cr
// * G = Y - 0.34414 * Cb - 0.71414 * Cr
// * B = Y + 1.77200 * Cb
#define YGJ 64 /* (int8)round(1.000 * 64) */
#define UBJ 113 /* (int8)round(1.772 * 64) */
#define UGJ -22 /* (int8)round(-0.34414 * 64) */
#define URJ 0
#define VBJ 0
#define VGJ -46 /* (int8)round(-0.71414 * 64) */
#define VRJ 90 /* (int8)round(1.402 * 64) */
// Bias
#define BBJ (UBJ * 128 + VBJ * 128)
#define BGJ (UGJ * 128 + VGJ * 128)
#define BRJ (URJ * 128 + VRJ * 128)
static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
uint32 y1 = (uint32)(y * YGJ);
*b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
*g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
*r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
}
void J422ToARGBRow_C(const uint8* src_y, void J422ToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
@@ -1354,23 +1378,23 @@ void I411ToARGBRow_C(const uint8* src_y,
} }
void NV12ToARGBRow_C(const uint8* src_y, void NV12ToARGBRow_C(const uint8* src_y,
const uint8* usrc_v, const uint8* src_uv,
uint8* rgb_buf, uint8* rgb_buf,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], YuvPixel(src_y[1], src_uv[0], src_uv[1],
rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
usrc_v += 2; src_uv += 2;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], YuvPixel(src_y[0], src_uv[0], src_uv[1],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
@@ -1402,7 +1426,7 @@ void NV21ToARGBRow_C(const uint8* src_y,
} }
void NV12ToRGB565Row_C(const uint8* src_y, void NV12ToRGB565Row_C(const uint8* src_y,
const uint8* usrc_v, const uint8* src_uv,
uint8* dst_rgb565, uint8* dst_rgb565,
int width) { int width) {
uint8 b0; uint8 b0;
@@ -1413,8 +1437,8 @@ void NV12ToRGB565Row_C(const uint8* src_y,
uint8 r1; uint8 r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1); YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@@ -1424,11 +1448,11 @@ void NV12ToRGB565Row_C(const uint8* src_y,
*(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
(b1 << 16) | (g1 << 21) | (r1 << 27); (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2; src_y += 2;
usrc_v += 2; src_uv += 2;
dst_rgb565 += 4; // Advance 2 pixels. dst_rgb565 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@@ -1588,7 +1612,7 @@ void I422ToRGBARow_C(const uint8* src_y,
} }
} }
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@@ -2062,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
} }
} }
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
// Copy a row of G.
int x;
for (x = 0; x < pix - 1; x += 2) {
dst_bayer[0] = src_argb[1];
dst_bayer[1] = src_argb[5];
src_argb += 8;
dst_bayer += 2;
}
if (pix & 1) {
dst_bayer[0] = src_argb[1];
}
}
// Use first 4 shuffler values to reorder ARGB channels. // Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
@@ -2120,7 +2128,7 @@ void I422ToYUY2Row_C(const uint8* src_y,
if (width & 1) { if (width & 1) {
dst_frame[0] = src_y[0]; dst_frame[0] = src_y[0];
dst_frame[1] = src_u[0]; dst_frame[1] = src_u[0];
dst_frame[2] = src_y[0]; // duplicate last y dst_frame[2] = 0;
dst_frame[3] = src_v[0]; dst_frame[3] = src_v[0];
} }
} }
@@ -2144,14 +2152,15 @@ void I422ToUYVYRow_C(const uint8* src_y,
dst_frame[0] = src_u[0]; dst_frame[0] = src_u[0];
dst_frame[1] = src_y[0]; dst_frame[1] = src_y[0];
dst_frame[2] = src_v[0]; dst_frame[2] = src_v[0];
dst_frame[3] = src_y[0]; // duplicate last y dst_frame[3] = 0;
} }
} }
// Maximum temporary width for wrappers to process at a time, in pixels. // Maximum temporary width for wrappers to process at a time, in pixels.
#define MAXTWIDTH 2048 #define MAXTWIDTH 2048
#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3) #if !(defined(_MSC_VER) && !defined(__clang__)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper. // row_win.cc has asm version, but GCC uses 2 step wrapper.
void I422ToRGB565Row_SSSE3(const uint8* src_y, void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
@@ -2346,6 +2355,50 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y,
} }
#endif #endif
#if defined(HAS_I422TORGB24ROW_AVX2)
void I422ToRGB24Row_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_rgb24 += twidth * 3;
width -= twidth;
}
}
#endif
#if defined(HAS_I422TORAWROW_AVX2)
void I422ToRAWRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_raw,
int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
// TODO(fbarchard): ARGBToRAWRow_AVX2
ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
dst_raw += twidth * 3;
width -= twidth;
}
}
#endif
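These wrappers follow the strip-mining pattern used elsewhere in this file: each strip of at most MAXTWIDTH (2048) pixels is first converted to ARGB in a stack row buffer and then repacked with the existing SSSE3 ARGBTo* row function, presumably so the intermediate stays small enough to remain cache-resident until true AVX2 packing rows exist (the TODOs above).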
#if defined(HAS_NV12TORGB565ROW_AVX2) #if defined(HAS_NV12TORGB565ROW_AVX2)
void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
uint8* dst_rgb565, int width) { uint8* dst_rgb565, int width) {

View File

@@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
} }
#endif // TESTING #endif // TESTING
#ifdef HAS_I400TOARGBROW_SSE2 #ifdef HAS_J400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile ( asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n" "pslld $0x18,%%xmm5 \n"
@@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
:: "memory", "cc", "xmm0", "xmm1", "xmm5" :: "memory", "cc", "xmm0", "xmm1", "xmm5"
); );
} }
#endif // HAS_I400TOARGBROW_SSE2 #endif // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
@@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUVJROW_SSSE3
// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) { uint8* dst_u, uint8* dst_v, int width) {
asm volatile ( asm volatile (
@@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
// YUV to RGB conversion constants.
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
// U and V contributions to R,G,B.
#define UB -128 /* -min(128, round(2.018 * 64)) */
#define UG 25 /* -round(-0.391 * 64) */
#define VG 52 /* -round(-0.813 * 64) */
#define VR -102 /* -round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 - YGB)
#define BG (UG * 128 + VG * 128 - YGB)
#define BR (VR * 128 - YGB)
struct YuvConstants { struct YuvConstants {
lvec8 kUVToB; // 0 lvec8 kUVToB; // 0
lvec8 kUVToG; // 32 lvec8 kUVToG; // 32
@@ -1440,6 +1423,27 @@ struct YuvConstants {
lvec16 kYToRgb; // 192 lvec16 kYToRgb; // 192
}; };
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// BT601 constants for YUV to RGB. // BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = { static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
{ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
{ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
}; };
#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)
// JPEG constants for YUV to RGB.
YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
{ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
{ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
{ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
{ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
{ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
{ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
{ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};
#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ
// Read 8 UV from 411 // Read 8 UV from 411
#define READYUV444 \ #define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
@@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm2,%%xmm0 \n" \ "punpcklwd %%xmm2,%%xmm0 \n" \
"punpckhwd %%xmm2,%%xmm1 \n" \ "punpckhwd %%xmm2,%%xmm1 \n" \
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
// Store 8 BGRA values. Assumes XMM5 is zero. // Store 8 BGRA values. Assumes XMM5 is zero.
#define STOREBGRA \ #define STOREBGRA \
@@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \ "punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \ "punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
// Store 8 ABGR values. Assumes XMM5 is zero. // Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR \ #define STOREABGR \
@@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm0,%%xmm2 \n" \ "punpcklwd %%xmm0,%%xmm2 \n" \
"punpckhwd %%xmm0,%%xmm1 \n" \ "punpckhwd %%xmm0,%%xmm1 \n" \
"movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
// Store 8 RGBA values. Assumes XMM5 is zero. // Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \ #define STORERGBA \
@@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
"punpcklwd %%xmm1,%%xmm5 \n" \ "punpcklwd %%xmm1,%%xmm5 \n" \
"punpckhwd %%xmm1,%%xmm0 \n" \ "punpckhwd %%xmm1,%%xmm0 \n" \
"movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
@@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
); );
} }
void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422
YUVTORGB(kYuvConstants)
STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
@@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
"vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
"vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
"vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
"vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
"vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
@@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TOARGBROW_AVX2 #endif // HAS_I422TOARGBROW_AVX2
#if defined(HAS_J422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV422_AVX2
YUVTORGB_AVX2(kYuvConstants)
// Step 3: Weave into ARGB
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
"vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
"vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
"vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
"lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_J422TOARGBROW_AVX2
#if defined(HAS_I422TOABGRROW_AVX2) #if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
@@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
} }
#endif // HAS_I422TORGBAROW_AVX2 #endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n" "movd %%eax,%%xmm2 \n"
@@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
); );
} }
#endif // HAS_YTOARGBROW_SSE2 #endif // HAS_I400TOARGBROW_SSE2
#ifdef HAS_YTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates. // note: vpunpcklbw mutates and vpackuswb unmutates.
void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
"vmovd %%eax,%%xmm2 \n" "vmovd %%eax,%%xmm2 \n"
@@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
); );
} }
#endif // HAS_YTOARGBROW_AVX2 #endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes. // Shuffle table for reversing the bytes.
@@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n" "psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n" "sub $0x4,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 4 pixel loop. // 4 pixel loop.
@@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"psllw $0x8,%%xmm5 \n" "psllw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n" "pslld $0x18,%%xmm4 \n"
"sub $0x1,%3 \n" "sub $0x4,%3 \n"
"je 91f \n"
"jl 99f \n"
// 1 pixel loop until destination pointer is aligned.
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
"movd " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movd " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n" "jl 49f \n"
// 4 pixel loop. // 4 pixel loop.
@@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
} }
#endif // HAS_INTERPOLATEROW_SSE2 #endif // HAS_INTERPOLATEROW_SSE2
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "memory", "cc"
, "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

View File

@@ -94,11 +94,17 @@ extern "C" {
"vtrn.u32 d2, d3 \n" "vtrn.u32 d2, d3 \n"
#define YUV422TORGB_SETUP_REG \ #define YUV422TORGB_SETUP_REG \
MEMACCESS([kUVToRB]) \
"vld1.8 {d24}, [%[kUVToRB]] \n" \ "vld1.8 {d24}, [%[kUVToRB]] \n" \
MEMACCESS([kUVToG]) \
"vld1.8 {d25}, [%[kUVToG]] \n" \ "vld1.8 {d25}, [%[kUVToG]] \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
MEMACCESS([kUVBiasBGR]) \
"vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
MEMACCESS([kYToRgb]) \
"vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
#define YUV422TORGB \ #define YUV422TORGB \
@@ -186,7 +192,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -216,7 +222,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -246,7 +252,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -277,7 +283,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -308,7 +314,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -338,7 +344,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -367,7 +373,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -397,7 +403,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -439,7 +445,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -485,7 +491,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -526,14 +532,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %6 [kUVToG]"r"(&kUVToG), // %6
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
void YToARGBRow_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
".p2align 2 \n" ".p2align 2 \n"
@@ -552,17 +558,17 @@ void YToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
void I400ToARGBRow_NEON(const uint8* src_y, void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
".p2align 2 \n"
"vmov.u8 d23, #255 \n" "vmov.u8 d23, #255 \n"
".p2align 2 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n" "vld1.8 {d20}, [%0]! \n"
@@ -603,7 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -631,7 +637,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -659,7 +665,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -687,7 +693,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
[kUVToG]"r"(&kUVToG), // %5 [kUVToG]"r"(&kUVToG), // %5
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -713,7 +719,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -739,7 +745,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
[kUVToG]"r"(&kUVToG), // %4 [kUVToG]"r"(&kUVToG), // %4
[kUVBiasBGR]"r"(&kUVBiasBGR), [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3", : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
); );
} }
@@ -1245,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
); );
} }
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) { const uint8* shuffler, int pix) {
@@ -1360,6 +1347,30 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
); );
} }
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
".p2align 2 \n"
"vdup.32 d2, %2 \n" // dither4
"1: \n"
MEMACCESS(1)
"vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d20, d20, d2 \n"
"vqadd.u8 d21, d21, d2 \n"
"vqadd.u8 d22, d22, d2 \n"
ARGBTORGB565
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
);
}
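Note that the saturating vqadd.u8 instructions here play the role of the clamp255() calls in ARGBToRGB565DitherRow_C: the dither value is added to B, G and R with unsigned saturation before ARGBTORGB565 truncates the channels to 5/6/5 bits.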
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
asm volatile ( asm volatile (

View File

@@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV444 READYUV444
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV411 READYUV411
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v21, v22, v23) YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" /* A */ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v23, v22, v21) YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v20.8b, #255 \n" /* A */ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3) MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
@@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v20, v21, v22) YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
MEMACCESS(3) MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(3) MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
@@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(3) MEMACCESS(3)
@@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READYUV422 READYUV422
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n" "subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(3) MEMACCESS(3)
@@ -513,33 +513,34 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
} }
#endif // HAS_I422TOARGB4444ROW_NEON #endif // HAS_I422TOARGB4444ROW_NEON
#ifdef HAS_YTOARGBROW_NEON #ifdef HAS_I400TOARGBROW_NEON
void YToARGBRow_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUV400 READYUV400
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
); );
} }
#endif // HAS_YTOARGBROW_NEON #endif // HAS_I400TOARGBROW_NEON
#ifdef HAS_I400TOARGBROW_NEON #ifdef HAS_J400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y, void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
asm volatile ( asm volatile (
@@ -549,7 +550,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
"ld1 {v20.8b}, [%0], #8 \n" "ld1 {v20.8b}, [%0], #8 \n"
"orr v21.8b, v20.8b, v20.8b \n" "orr v21.8b, v20.8b, v20.8b \n"
"orr v22.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -560,7 +561,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
: "cc", "memory", "v20", "v21", "v22", "v23" : "cc", "memory", "v20", "v21", "v22", "v23"
); );
} }
#endif // HAS_I400TOARGBROW_NEON #endif // HAS_J400TOARGBROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON #ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y, void NV12ToARGBRow_NEON(const uint8* src_y,
@@ -572,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@@ -599,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(2) MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
@@ -626,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV12 READNV12
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@@ -653,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"1: \n" "1: \n"
READNV21 READNV21
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
ARGBTORGB565 ARGBTORGB565
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
@@ -674,19 +675,20 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
void YUY2ToARGBRow_NEON(const uint8* src_yuy2, void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READYUY2 READYUY2
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@@ -699,19 +701,20 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
void UYVYToARGBRow_NEON(const uint8* src_uyvy, void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb, uint8* dst_argb,
int width) { int width) {
int64 width64 = (int64)(width);
asm volatile ( asm volatile (
YUV422TORGB_SETUP_REG YUV422TORGB_SETUP_REG
"1: \n" "1: \n"
READUYVY READUYVY
YUV422TORGB(v22, v21, v20) YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n" "subs %w2, %w2, #8 \n"
"movi v23.8b, #255 \n" "movi v23.8b, #255 \n"
MEMACCESS(1) MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width64) // %2
: [kUVBiasBGR]"r"(&kUVBiasBGR), : [kUVBiasBGR]"r"(&kUVBiasBGR),
[kYToRgb]"r"(&kYToRgb) [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
@@ -728,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2) MEMACCESS(2)
@@ -754,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v0.16b}, [%0], #16 \n" // load U
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load V "ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
MEMACCESS(2) MEMACCESS(2)
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n" "b.gt 1b \n"
@@ -776,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop "subs %w2, %w2, #32 \n" // 32 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
"b.gt 1b \n" "b.gt 1b \n"
@@ -794,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop "subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
@@ -809,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints "dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n" "1: \n"
"subs %1, %1, #4 \n" // 4 ints per loop "subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store "st1 {v0.16b}, [%0], #16 \n" // store
"b.gt 1b \n" "b.gt 1b \n"
@@ -822,6 +825,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
#ifdef HAS_MIRRORROW_NEON #ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %2 \n" "add %0, %0, %2 \n"
@@ -830,7 +834,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
"subs %2, %2, #16 \n" // 16 pixels per loop. "subs %2, %2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n" "rev64 v0.16b, v0.16b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -839,7 +843,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3 : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0" : "cc", "memory", "v0"
); );
@@ -849,6 +853,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORUVROW_NEON #ifdef HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) { int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %3, lsl #1 \n" "add %0, %0, %3, lsl #1 \n"
@@ -868,7 +873,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 "+r"(width64) // %3
: "r"((ptrdiff_t)-16) // %4 : "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1" : "cc", "memory", "v0", "v1"
); );
@@ -877,6 +882,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
#ifdef HAS_ARGBMIRRORROW_NEON #ifdef HAS_ARGBMIRRORROW_NEON
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
int64 width64 = (int64) width;
asm volatile ( asm volatile (
// Start at end of source row. // Start at end of source row.
"add %0, %0, %2, lsl #2 \n" "add %0, %0, %2, lsl #2 \n"
@@ -894,7 +900,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3 : "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0" : "cc", "memory", "v0"
); );
@@ -908,7 +914,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
"b.gt 1b \n" "b.gt 1b \n"
@@ -928,7 +934,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g "orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1) MEMACCESS(1)
@@ -963,7 +969,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1022,7 +1028,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1055,7 +1061,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
MEMACCESS(1) MEMACCESS(1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
@@ -1075,7 +1081,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1094,7 +1100,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v4.8b, v2.8b, v2.8b \n" // mov g
"orr v5.8b, v1.8b, v1.8b \n" // mov b "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1) MEMACCESS(1)
@@ -1115,7 +1121,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1134,7 +1140,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1154,7 +1160,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
@@ -1177,7 +1183,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2) MEMACCESS(2)
@@ -1201,7 +1207,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
@@ -1231,7 +1237,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
@@ -1253,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
} }
#endif // HAS_UYVYTOUVROW_NEON #endif // HAS_UYVYTOUVROW_NEON
// Select G channels from ARGB. e.g. GGGGGGGG
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGBTOBAYERGGROW_NEON
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
#ifdef HAS_ARGBSHUFFLEROW_NEON #ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
@@ -1284,7 +1269,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4. "st1 {v1.16b}, [%1], #16 \n" // store 4.
@@ -1312,7 +1297,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1341,7 +1326,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2) MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels
MEMACCESS(3) MEMACCESS(3)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n" "b.gt 1b \n"
@@ -1362,7 +1347,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565 ARGBTORGB565
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
@@ -1376,6 +1361,31 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
} }
#endif // HAS_ARGBTORGB565ROW_NEON #endif // HAS_ARGBTORGB565ROW_NEON
#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
"dup v1.4s, %w2 \n" // dither4
"1: \n"
MEMACCESS(1)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v20.8b, v20.8b, v1.8b \n"
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n"
ARGBTORGB565
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
"r"(dither4), // %2
"r"(width) // %3
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
#endif  // HAS_ARGBTORGB565DITHERROW_NEON
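For reference, a minimal C++ sketch (not libyuv code; the helper name is made up) of what the new dither row does per pixel: the same dither byte is added to B, G and R with saturation (the uqadd instructions above), and the result is then truncated to 5/6/5 bits. dither4 packs four such bytes, repeated across the 8-pixel NEON batch.

#include <algorithm>
#include <cstdint>

// Saturating-add a dither byte to each channel, then keep 5/6/5 bits.
uint16_t DitherPixelToRGB565(uint8_t b, uint8_t g, uint8_t r, uint8_t d) {
  b = static_cast<uint8_t>(std::min(255, b + d));  // uqadd v20.8b, v20.8b, v1.8b
  g = static_cast<uint8_t>(std::min(255, g + d));  // uqadd v21.8b, v21.8b, v1.8b
  r = static_cast<uint8_t>(std::min(255, r + d));  // uqadd v22.8b, v22.8b, v1.8b
  return static_cast<uint16_t>(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}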
#ifdef HAS_ARGBTOARGB1555ROW_NEON #ifdef HAS_ARGBTOARGB1555ROW_NEON
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) { int pix) {
@@ -1383,7 +1393,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555 ARGBTOARGB1555
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
@@ -1405,7 +1415,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444 ARGBTOARGB4444
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
@@ -1429,7 +1439,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
@@ -1456,7 +1466,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
@@ -1487,7 +1497,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R "umlsl v4.8h, v2.8b, v26.8b \n" // R
@@ -1531,7 +1541,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
@@ -1587,7 +1597,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop. "subs %w3, %w3, #32 \n" // 32 processed per loop.
"mul v3.8h, v0.8h, v20.8h \n" // B "mul v3.8h, v0.8h, v20.8h \n" // B
"mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v1.8h, v21.8h \n" // G
"mls v3.8h, v2.8h, v22.8h \n" // R "mls v3.8h, v2.8h, v22.8h \n" // R
@@ -1653,7 +1663,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1700,7 +1710,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1741,7 +1751,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"urshr v1.8h, v3.8h, #1 \n" "urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1782,7 +1792,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h) RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1823,7 +1833,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1864,7 +1874,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h) RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1905,7 +1915,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"urshr v1.8h, v1.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop. "subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h) RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
@@ -1971,7 +1981,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"urshr v5.8h, v18.8h, #1 \n" "urshr v5.8h, v18.8h, #1 \n"
"urshr v6.8h, v20.8h, #1 \n" "urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v16.8h, v4.8h, v22.8h \n" // B "mul v16.8h, v4.8h, v22.8h \n" // B
"mls v16.8h, v5.8h, v23.8h \n" // G "mls v16.8h, v5.8h, v23.8h \n" // G
"mls v16.8h, v6.8h, v24.8h \n" // R "mls v16.8h, v6.8h, v24.8h \n" // R
@@ -2042,7 +2052,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
@@ -2113,7 +2123,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"urshr v5.8h, v17.8h, #1 \n" "urshr v5.8h, v17.8h, #1 \n"
"urshr v6.8h, v18.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop. "subs %w4, %w4, #16 \n" // 16 processed per loop.
"mul v2.8h, v4.8h, v20.8h \n" // B "mul v2.8h, v4.8h, v20.8h \n" // B
"mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v5.8h, v21.8h \n" // G
"mls v2.8h, v6.8h, v22.8h \n" // R "mls v2.8h, v6.8h, v22.8h \n" // R
@@ -2153,7 +2163,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB RGB565TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
@@ -2183,7 +2193,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB ARGB1555TOARGB
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -2212,7 +2222,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB ARGB4444TOARGB
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
@@ -2241,7 +2251,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B "umlal v16.8h, v3.8b, v6.8b \n" // B
@@ -2269,7 +2279,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B "umlal v16.8h, v2.8b, v6.8b \n" // B
@@ -2297,7 +2307,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R "umlal v16.8h, v3.8b, v6.8b \n" // R
@@ -2325,7 +2335,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
@@ -2353,7 +2363,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
@@ -2380,13 +2390,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
int y0_fraction = 256 - y1_fraction; int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %w4, #0 \n"
"b.eq 100f \n" "b.eq 100f \n"
"cmp %4, #64 \n" "cmp %w4, #64 \n"
"b.eq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %w4, #128 \n"
"b.eq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %w4, #192 \n"
"b.eq 25f \n" "b.eq 25f \n"
"dup v5.16b, %w4 \n" "dup v5.16b, %w4 \n"
@@ -2397,7 +2407,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v2.8h, v0.8b, v4.8b \n" "umull v2.8h, v0.8b, v4.8b \n"
"umull2 v3.8h, v0.16b, v4.16b \n" "umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n" "umlal v2.8h, v1.8b, v5.8b \n"
@@ -2415,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -2429,7 +2439,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
@@ -2442,7 +2452,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -2454,7 +2464,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
@@ -2477,7 +2487,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) { uint8* dst_argb, int width) {
asm volatile ( asm volatile (
"subs %3, %3, #8 \n" "subs %w3, %w3, #8 \n"
"b.lt 89f \n" "b.lt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
"8: \n" "8: \n"
@@ -2485,7 +2495,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
@@ -2504,7 +2514,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"b.ge 8b \n" "b.ge 8b \n"
"89: \n" "89: \n"
"adds %3, %3, #8-1 \n" "adds %w3, %w3, #8-1 \n"
"b.lt 99f \n" "b.lt 99f \n"
// Blend 1 pixels. // Blend 1 pixels.
@@ -2513,7 +2523,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop. "subs %w3, %w3, #1 \n" // 1 processed per loop.
"umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v16.8h, v4.8b, v3.8b \n" // db * a
"umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a
"umull v18.8h, v6.8b, v3.8b \n" // dr * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a
@@ -2552,7 +2562,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v4.8h, v0.8b, v3.8b \n" // b * a
"umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a "umull v6.8h, v2.8b, v3.8b \n" // r * a
@@ -2586,7 +2596,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n" "uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n" "uxtl v2.8h, v2.8b \n"
@@ -2630,7 +2640,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
"uxtl v5.8h, v5.8b \n" "uxtl v5.8h, v5.8b \n"
"uxtl v6.8h, v6.8b \n" "uxtl v6.8h, v6.8b \n"
@@ -2667,7 +2677,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
@@ -2706,7 +2716,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop. "subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R "umlal v4.8h, v2.8b, v22.8b \n" // R
@@ -2746,7 +2756,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
"uxtl v17.8h, v17.8b \n" // g "uxtl v17.8h, v17.8b \n" // g
"uxtl v18.8h, v18.8b \n" // r "uxtl v18.8h, v18.8b \n" // r
@@ -2808,7 +2818,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v1.8h, v1.8b, v5.8b \n" // multiply G
"umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v2.8h, v2.8b, v6.8b \n" // multiply R
@@ -2842,7 +2852,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n"
@@ -2872,7 +2882,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1) MEMACCESS(1)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n"
@@ -2907,7 +2917,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add "uqadd v0.8b, v0.8b, v1.8b \n" // add
"orr v1.8b, v0.8b, v0.8b \n" "orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n"
@@ -2935,7 +2945,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop. "subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add "uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
@@ -2966,7 +2976,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add "uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2) MEMACCESS(2)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
@@ -3006,7 +3016,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v2.8b}, [%2],%5 \n" // bottom
MEMACCESS(2) MEMACCESS(2)
"ld1 {v3.8b}, [%2],%6 \n" "ld1 {v3.8b}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels "subs %w4, %w4, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
@@ -3019,8 +3029,8 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y2), // %2 "+r"(src_y2), // %2
"+r"(dst_sobelx), // %3 "+r"(dst_sobelx), // %3
"+r"(width) // %4 "+r"(width) // %4
: "r"(2), // %5 : "r"(2LL), // %5
"r"(6) // %6 "r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
@@ -3051,7 +3061,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v2.8b}, [%0],%5 \n" // right
MEMACCESS(1) MEMACCESS(1)
"ld1 {v3.8b}, [%1],%5 \n" "ld1 {v3.8b}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels "subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n" "usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n"
"abs v0.8h, v0.8h \n" "abs v0.8h, v0.8h \n"
@@ -3063,8 +3073,8 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"+r"(src_y1), // %1 "+r"(src_y1), // %1
"+r"(dst_sobely), // %2 "+r"(dst_sobely), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"(1), // %4 : "r"(1LL), // %4
"r"(6) // %5 "r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
); );
} }
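The bulk of the row_neon64.cc changes follow one pattern: on AArch64, an inline-asm operand backed by a 32-bit int must be referenced through the %wN (Wn) view for arithmetic such as subs, while anything used as an address or post-index offset must be a full 64-bit value, hence the int64 width64 copies for the mirror rows and the 1LL/2LL/6LL constants in the Sobel kernels. A minimal standalone sketch of the register-width point, not taken from libyuv and assuming a positive width that is a multiple of 8:

#include <cstdint>

// Counts 8-element blocks the way the row loops above do. "%w1" picks the
// 32-bit Wn register for the int operand; plain "%1" would name Xn, whose
// upper 32 bits are not guaranteed to be zero for an int argument.
int CountBlocks(int width) {  // assumes width > 0 and width % 8 == 0
  int blocks = 0;
#if defined(__aarch64__)
  asm volatile(               // GCC/Clang extended asm
      "1:                            \n"
      "add  %w0, %w0, #1             \n"
      "subs %w1, %w1, #8             \n"  // 32-bit subtract, sets flags
      "b.gt 1b                       \n"
      : "+r"(blocks), "+r"(width)
      :
      : "cc");
#else
  blocks = width / 8;  // portable equivalent
#endif
  return blocks;
}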

File diff suppressed because it is too large

View File

@@ -23,9 +23,6 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// Remove this macro if OVERREAD is safe.
#define AVOID_OVERREAD 1
static __inline int Abs(int v) { static __inline int Abs(int v) {
return v >= 0 ? v : -v; return v >= 0 ? v : -v;
} }
@@ -44,9 +41,8 @@ static void ScalePlaneDown2(int src_width, int src_height,
int y; int y;
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) = uint8* dst_ptr, int dst_width) =
filtering == kFilterNone ? ScaleRowDown2_C : filtering == kFilterNone ? ScaleRowDown2_C :
(filtering == kFilterLinear ? ScaleRowDown2Linear_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
ScaleRowDown2Box_C);
int row_stride = src_stride << 1; int row_stride = src_stride << 1;
if (!filtering) { if (!filtering) {
src_ptr += src_stride; // Point to odd rows. src_ptr += src_stride; // Point to odd rows.
@@ -54,15 +50,39 @@ static void ScalePlaneDown2(int src_width, int src_height,
} }
#if defined(HAS_SCALEROWDOWN2_NEON) #if defined(HAS_SCALEROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
ScaleRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
(filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
ScaleRowDown2Box_NEON);
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_SSE2) #if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_SSE2); ScaleRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
ScaleRowDown2Box_Any_AVX2);
if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
ScaleRowDown2Box_AVX2);
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
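The dispatch pattern repeated throughout these scale.cc hunks: previously the SIMD row function was only used when dst_width met the kernel's alignment, otherwise the C version ran for the whole row; now an _Any_ wrapper (SIMD body plus a scalar tail) is selected whenever the CPU feature is present, and the fully aligned kernel only when the width allows it. A schematic C++ sketch with hypothetical function names, not the libyuv API:

#include <cstddef>
#include <cstdint>

typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t src_stride,
                           uint8_t* dst, int dst_width);

// Mirrors the selection logic above for one SIMD feature.
ScaleRowFn ChooseRowDown2(bool has_simd, int dst_width,
                          ScaleRowFn c_version,
                          ScaleRowFn any_version,        // any width, C tail
                          ScaleRowFn aligned_version) {  // needs dst_width % 16 == 0
  ScaleRowFn fn = c_version;
  if (has_simd) {
    fn = any_version;            // safe default once the feature is present
    if (dst_width % 16 == 0) {
      fn = aligned_version;      // full-width SIMD, no scalar tail
    }
  }
  return fn;
}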
@@ -154,13 +174,30 @@ static void ScalePlaneDown4(int src_width, int src_height,
src_stride = 0; src_stride = 0;
} }
#if defined(HAS_SCALEROWDOWN4_NEON) #if defined(HAS_SCALEROWDOWN4_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_SSE2) #if defined(HAS_SCALEROWDOWN4_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
if (IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
}
#endif
#if defined(HAS_SCALEROWDOWN4_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
}
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) #if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
@@ -249,24 +286,42 @@ static void ScalePlaneDown34(int src_width, int src_height,
ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
} }
#if defined(HAS_SCALEROWDOWN34_NEON) #if defined(HAS_SCALEROWDOWN34_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON; ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON; ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_NEON;
ScaleRowDown34_1 = ScaleRowDown34_NEON;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
}
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN34_SSSE3) #if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) { if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
} else { } else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
}
if (dst_width % 24 == 0) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
}
} }
} }
#endif #endif
@@ -422,23 +477,41 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
} }
#if defined(HAS_SCALEROWDOWN38_NEON) #if defined(HAS_SCALEROWDOWN38_NEON)
if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { if (TestCpuFlag(kCpuHasNEON)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON; ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON; ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
} else { } else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
}
if (dst_width % 12 == 0) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_NEON;
ScaleRowDown38_2 = ScaleRowDown38_NEON;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
} }
} }
#endif #endif
#if defined(HAS_SCALEROWDOWN38_SSSE3) #if defined(HAS_SCALEROWDOWN38_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) { if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
}
if (dst_width % 12 == 0 && !filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
} else { }
if (dst_width % 6 == 0 && filtering) {
ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
} }
@@ -559,65 +632,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
} }
} }
static __inline uint32 SumBox(int iboxwidth, int iboxheight, #define MIN1(x) ((x) < 1 ? 1 : (x))
ptrdiff_t src_stride, const uint8* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
ptrdiff_t src_stride, const uint16* src_ptr) {
uint32 sum = 0u;
int y;
assert(iboxwidth > 0);
assert(iboxheight > 0);
for (y = 0; y < iboxheight; ++y) {
int x;
for (x = 0; x < iboxwidth; ++x) {
sum += src_ptr[x];
}
src_ptr += src_stride;
}
return sum;
}
static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint8* src_ptr, uint8* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
int x, int dx, ptrdiff_t src_stride,
const uint16* src_ptr, uint16* dst_ptr) {
int i;
int boxwidth;
for (i = 0; i < dst_width; ++i) {
int ix = x >> 16;
x += dx;
boxwidth = (x >> 16) - ix;
*dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
(boxwidth * boxheight);
}
}
static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
uint32 sum = 0u; uint32 sum = 0u;
@@ -643,15 +658,15 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = (dx >> 16); int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth; int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = (x >> 16) - ix; boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
} }
} }
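The MIN1() clamp added above protects the reciprocal table: boxwidth and boxheight come from 16.16 fixed-point differences, so a box dimension can truncate to 0 (for example boxheight at the clamped bottom edge, or minboxwidth when dx is below one source pixel), and 65536 / (0 * boxheight) would divide by zero; this is the case the removed C fallback used to sidestep. A small illustrative calculation, not libyuv code:

#include <cstdio>

#define MIN1(x) ((x) < 1 ? 1 : (x))

int main() {
  // 16.16 fixed point: each output column covers dx/65536 source columns.
  int dx = 0x18000;             // 1.5 source pixels per output pixel
  int boxheight = 1;            // e.g. the last row, clamped at the image edge
  int minboxwidth = dx >> 16;   // 1 here; would be 0 if dx were below 0x10000
  int scaletbl[2];
  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  int x = 0;
  for (int i = 0; i < 4; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = MIN1((x >> 16) - ix);           // alternates 1, 2, 1, 2
    int scale = scaletbl[boxwidth - minboxwidth];  // 65536 / boxwidth here
    printf("col %d: boxwidth=%d scale=%d\n", i, boxwidth, scale);
  }
  return 0;
}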
@@ -660,25 +675,36 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) { const uint32* src_ptr, uint16* dst_ptr) {
int i; int i;
int scaletbl[2]; int scaletbl[2];
int minboxwidth = (dx >> 16); int minboxwidth = dx >> 16;
int* scaleptr = scaletbl - minboxwidth; int* scaleptr = scaletbl - minboxwidth;
int boxwidth; int boxwidth;
scaletbl[0] = 65536 / (minboxwidth * boxheight); scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
int ix = x >> 16; int ix = x >> 16;
x += dx; x += dx;
boxwidth = (x >> 16) - ix; boxwidth = MIN1((x >> 16) - ix);
*dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * *dst_ptr++ =
scaleptr[boxwidth] >> 16; SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
}
}
static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
const uint16* src_ptr, uint8* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
} }
} }
static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) { const uint16* src_ptr, uint8* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int i; int i;
x >>= 16;
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
x += boxwidth; x += boxwidth;
@@ -687,7 +713,7 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) { const uint32* src_ptr, uint16* dst_ptr) {
int boxwidth = (dx >> 16); int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight); int scaleval = 65536 / (boxwidth * boxheight);
int i; int i;
for (i = 0; i < dst_width; ++i) { for (i = 0; i < dst_width; ++i) {
@@ -707,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) { const uint8* src_ptr, uint8* dst_ptr) {
int j; int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point. // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0; int x = 0;
int y = 0; int y = 0;
@@ -717,10 +743,40 @@ static void ScalePlaneBox(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1. {
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { // Allocate a row buffer of uint16.
uint8* dst = dst_ptr; align_buffer_64(row16, src_width * 2);
int j; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C:
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleAddRow = ScaleAddRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_SSE2;
}
}
#endif
#if defined(HAS_SCALEADDROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleAddRow = ScaleAddRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
ScaleAddRow = ScaleAddRow_AVX2;
}
}
#endif
#if defined(HAS_SCALEADDROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleAddRow = ScaleAddRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_NEON;
}
}
#endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
int boxheight; int boxheight;
int iy = y >> 16; int iy = y >> 16;
@@ -729,46 +785,13 @@ static void ScalePlaneBox(int src_width, int src_height,
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
} }
boxheight = (y >> 16) - iy; boxheight = MIN1((y >> 16) - iy);
ScalePlaneBoxRow_C(dst_width, boxheight, memset(row16, 0, src_width * 2);
x, dx, src_stride, for (k = 0; k < boxheight; ++k) {
src, dst); ScaleAddRow(src, (uint16 *)(row16), src_width);
dst += dst_stride; src += src_stride;
}
return;
}
{
// Allocate a row buffer of uint16.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint16* src_ptr, uint8* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint8* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
} }
boxheight = (y >> 16) - iy; ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
ScaleAddRows(src, src_stride, (uint16*)(row16),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
dst_ptr);
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
free_aligned_buffer_64(row16); free_aligned_buffer_64(row16);
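The rewritten loop replaces the multi-row ScaleAddRows kernels (and the AVOID_OVERREAD width restriction) with a simpler accumulate-per-row scheme: clear a uint16 row, call ScaleAddRow once per source row in the box, then let ScaleAddCols do the horizontal sum and the 65536/(boxwidth*boxheight) scaling. A plain C++ sketch of that scheme, a hypothetical helper rather than the libyuv function, under the simplifying assumption of an exact integer ratio:

#include <cstdint>
#include <vector>

// Box-filters `boxheight` source rows down to one destination row.
// Assumes dst_width * boxwidth <= src_width.
void BoxScaleRow(const uint8_t* src, int src_stride, int src_width,
                 int boxwidth, int boxheight, uint8_t* dst, int dst_width) {
  std::vector<uint16_t> row(src_width, 0);           // per-column accumulator
  for (int k = 0; k < boxheight; ++k) {              // ScaleAddRow per source row
    for (int x = 0; x < src_width; ++x)
      row[x] = static_cast<uint16_t>(row[x] + src[x]);
    src += src_stride;
  }
  const int scaleval = 65536 / (boxwidth * boxheight);  // 16.16 reciprocal
  for (int i = 0; i < dst_width; ++i) {              // ScaleAddCols equivalent
    uint32_t sum = 0;
    for (int b = 0; b < boxwidth; ++b) sum += row[i * boxwidth + b];
    dst[i] = static_cast<uint8_t>((sum * scaleval) >> 16);
  }
}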
@@ -779,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
int dst_width, int dst_height, int dst_width, int dst_height,
int src_stride, int dst_stride, int src_stride, int dst_stride,
const uint16* src_ptr, uint16* dst_ptr) { const uint16* src_ptr, uint16* dst_ptr) {
int j; int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point. // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0; int x = 0;
int y = 0; int y = 0;
@@ -789,10 +812,21 @@ static void ScalePlaneBox_16(int src_width, int src_height,
ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
// TODO(fbarchard): Remove this and make AddRows handle boxheight 1. {
if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { // Allocate a row buffer of uint32.
uint16* dst = dst_ptr; align_buffer_64(row32, src_width * 4);
int j; void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
ScaleAddRow_16_C;
#if defined(HAS_SCALEADDROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
ScaleAddRow = ScaleAddRow_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
int boxheight; int boxheight;
int iy = y >> 16; int iy = y >> 16;
@@ -801,46 +835,13 @@ static void ScalePlaneBox_16(int src_width, int src_height,
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
} }
boxheight = (y >> 16) - iy; boxheight = MIN1((y >> 16) - iy);
ScalePlaneBoxRow_16_C(dst_width, boxheight, memset(row32, 0, src_width * 4);
x, dx, src_stride, for (k = 0; k < boxheight; ++k) {
src, dst); ScaleAddRow(src, (uint32 *)(row32), src_width);
dst += dst_stride; src += src_stride;
}
return;
}
{
// Allocate a row buffer of uint32.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
const uint32* src_ptr, uint16* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
&& IS_ALIGNED(src_width, 16)
#endif
) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
const uint16* src = src_ptr + iy * src_stride;
y += dy;
if (y > (src_height << 16)) {
y = (src_height << 16);
} }
boxheight = (y >> 16) - iy; ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
ScaleAddRows(src, src_stride, (uint32*)(row32),
src_width, boxheight);
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
dst_ptr);
dst_ptr += dst_stride; dst_ptr += dst_stride;
} }
free_aligned_buffer_64(row32); free_aligned_buffer_64(row32);
@@ -920,6 +921,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (y > max_y) { if (y > max_y) {
y = max_y; y = max_y;
@@ -1057,8 +1066,8 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C; InterpolateRow_C;
void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) = int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C; filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
@@ -1111,6 +1120,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_SSSE3; ScaleFilterCols = ScaleFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
ScaleFilterCols = ScaleFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleFilterCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C; ScaleFilterCols = ScaleColsUp2_C;
@@ -1129,7 +1146,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
const uint8* src = src_ptr + yi * src_stride; const uint8* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers. // Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15; const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row; uint8* rowptr = row;
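A small recurring change in this commit: the temporary row buffers round their stride up to 32 bytes instead of 16, here (dst_width + 31) & ~31 and later the ARGB equivalents, presumably so the new 32-byte-wide AVX2 kernels can store full registers into either row without spilling into the other. As a quick illustrative check of the arithmetic (ROUND32 is a made-up macro, not from the tree):

/* Illustrative: round a byte count up to the next multiple of 32. */
#define ROUND32(n) (((n) + 31) & ~31)
/* e.g. dst_width = 100 gives ROUND32(100) == 128, so the second row at
 * rowptr + kRowSize stays 32-byte aligned when the buffer itself is. */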
@@ -1188,8 +1205,8 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_16_C; InterpolateRow_16_C;
void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
int dst_width, int x, int dx) = int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
&x, &y, &dx, &dy); &x, &y, &dx, &dy);
src_width = Abs(src_width); src_width = Abs(src_width);
@@ -1260,7 +1277,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
const uint16* src = src_ptr + yi * src_stride; const uint16* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers. // Allocate 2 row buffers.
const int kRowSize = (dst_width + 15) & ~15; const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 4); align_buffer_64(row, kRowSize * 4);
uint16* rowptr = (uint16*)row; uint16* rowptr = (uint16*)row;
@@ -1334,8 +1351,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
} }
for (i = 0; i < dst_height; ++i) { for (i = 0; i < dst_height; ++i) {
ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
dst_width, x, dx);
dst_ptr += dst_stride; dst_ptr += dst_stride;
y += dy; y += dy;
} }
@@ -1385,8 +1401,7 @@ void ScalePlane(const uint8* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
@@ -1402,9 +1417,9 @@ void ScalePlane(const uint8* src, int src_stride,
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
return; return;
} }
if (dst_width == src_width) { if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height); int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled vertically. // Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical(src_height, ScalePlaneVertical(src_height,
dst_width, dst_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, src_stride, dst_stride, src, dst,
@@ -1435,7 +1450,7 @@ void ScalePlane(const uint8* src, int src_stride,
return; return;
} }
if (4 * dst_width == src_width && 4 * dst_height == src_height && if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) { (filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4 // optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height, ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering); src_stride, dst_stride, src, dst, filtering);
@@ -1469,8 +1484,7 @@ void ScalePlane_16(const uint16* src, int src_stride,
enum FilterMode filtering) { enum FilterMode filtering) {
// Simplify filtering when possible. // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, filtering = ScaleFilterReduce(src_width, src_height,
dst_width, dst_height, dst_width, dst_height, filtering);
filtering);
// Negative height means invert the image. // Negative height means invert the image.
if (src_height < 0) { if (src_height < 0) {
@@ -1563,6 +1577,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }
@@ -1594,6 +1609,7 @@ int I420Scale_16(const uint16* src_y, int src_stride_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }
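Both I420Scale and I420Scale_16 now also reject source dimensions above 32768 up front. A hedged caller-side sketch showing where that -1 surfaces; tightly packed planes are assumed and ScaleFrame is a made-up wrapper name:

#include "libyuv/scale.h"   /* I420Scale, kFilterBox */

/* Made-up convenience wrapper: scale one tightly packed I420 frame.
 * Returns libyuv's result: -1 for NULL planes, zero-sized sources, or
 * source dimensions larger than 32768 on either axis. */
static int ScaleFrame(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                      int src_w, int src_h,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v,
                      int dst_w, int dst_h) {
  return I420Scale(src_y, src_w,
                   src_u, (src_w + 1) / 2,
                   src_v, (src_w + 1) / 2,
                   src_w, src_h,
                   dst_y, dst_w,
                   dst_u, (dst_w + 1) / 2,
                   dst_v, (dst_w + 1) / 2,
                   dst_w, dst_h, kFilterBox);
}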

200 third_party/libyuv/source/scale_any.cc vendored Normal file
View File

@@ -0,0 +1,200 @@
/*
* Copyright 2015 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
int dst_width, int x, int dx) { \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, \
dst_width & MASK, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C, 4, 3)
#endif
#undef CANY
// Fixed scale down.
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
ScaleRowDown2Box_C, 2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_SSE2
SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
4, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
4, 1, 7)
#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
ScaleRowDown34_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
ScaleRowDown38_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
ScaleARGBRowDown2_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
ScaleARGBRowDown2Linear_C, 2, 4, 3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
ScaleARGBRowDown2Box_C, 2, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
ScaleARGBRowDown2_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
ScaleARGBRowDown2Linear_C, 2, 4, 7)
SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
ScaleARGBRowDown2Box_C, 2, 4, 7)
#endif
#undef SDANY
// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
uint8* dst_ptr, int dst_width) { \
int r = (int)((unsigned int)dst_width % (MASK + 1)); \
int n = dst_width - r; \
if (n > 0) { \
SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
} \
SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
src_stepx, dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
ScaleARGBRowDownEven_C, 4, 3)
SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
ScaleARGBRowDownEvenBox_C, 4, 3)
#endif
// Add rows box filter scale down.
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
int n = src_width & ~MASK; \
if (n > 0) { \
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
} \
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROW_SSE2
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
#endif
#undef SAANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
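All of the wrappers in this new file follow one shape: run the SIMD kernel over the largest width it handles, then let the C kernel finish the ragged tail. Roughly what SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) expands to, written out by hand for readability (the real macro uses an unsigned modulo rather than the mask, which is equivalent for non-negative widths):

#include "libyuv/scale_row.h"   /* kernel declarations */

void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  int r = dst_width & 15;       /* MASK = 15: tail the SSE2 kernel skips */
  int n = dst_width - r;        /* multiple-of-16 portion */
  if (n > 0) {
    ScaleRowDown2_SSE2(src_ptr, src_stride, dst_ptr, n);
  }
  /* FACTOR = 2 source pixels per output, BPP = 1 byte per pixel. */
  ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
}

This is also why the callers in scale.cc and scale_argb.cc can drop their hard IS_ALIGNED preconditions and treat alignment as an optional upgrade instead.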

View File

@@ -53,16 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height,
} }
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
ScaleARGBRowDown2Box_SSE2); ScaleARGBRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON) #if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
ScaleARGBRowDown2_NEON; (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
ScaleARGBRowDown2Box_Any_NEON);
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
ScaleARGBRowDown2Box_NEON);
}
} }
#endif #endif
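With the Any wrappers available, the selection logic throughout this file becomes a two-step pattern: take the Any variant as soon as the CPU feature is present, then upgrade to the full-width kernel when the destination width is aligned. A sketch of that pattern pulled out into a hypothetical helper (PickARGBRowDown2Box is not a real libyuv function; the kernel and HAS_* names are taken from this drop's scale_row.h):

#include "libyuv/basic_types.h"  /* IS_ALIGNED */
#include "libyuv/cpu_id.h"       /* TestCpuFlag, kCpuHasSSE2 */
#include "libyuv/scale_row.h"    /* ScaleARGBRowDown2Box_* kernels */

typedef void (*ARGBRowDown2Fn)(const uint8* src_argb, ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width);

static ARGBRowDown2Fn PickARGBRowDown2Box(int dst_width) {
  ARGBRowDown2Fn fn = ScaleARGBRowDown2Box_C;      /* portable fallback */
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    fn = ScaleARGBRowDown2Box_Any_SSE2;            /* any dst_width */
    if (IS_ALIGNED(dst_width, 4)) {
      fn = ScaleARGBRowDown2Box_SSE2;              /* full-width fast path */
    }
  }
#endif
  return fn;
}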
@@ -86,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
int x, int dx, int y, int dy) { int x, int dx, int y, int dy) {
int j; int j;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 2 * 4 + 15) & ~15; const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16); int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
@@ -96,15 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4. assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWN2_NEON) #if defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
} }
#endif #endif
for (j = 0; j < dst_height; ++j) { for (j = 0; j < dst_height; ++j) {
ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
@@ -135,15 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2)); assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4; src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
ScaleARGBRowDownEven_SSE2; ScaleARGBRowDownEven_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
} }
#endif #endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) { if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
ScaleARGBRowDownEven_NEON; ScaleARGBRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
} }
#endif #endif
@@ -229,6 +255,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif #endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB. // Allocate a row of ARGB.
@@ -321,10 +355,26 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif #endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2) #if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2; ScaleARGBFilterCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C; ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -344,7 +394,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
const uint8* src = src_argb + yi * src_stride; const uint8* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15; const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
uint8* rowptr = row; uint8* rowptr = row;
@@ -495,10 +545,26 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
} }
#endif #endif
#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
if (filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
}
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2) #if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2; ScaleARGBFilterCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) { if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C; ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -521,7 +587,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
const uint8* src_row_v = src_v + uv_yi * src_stride_v; const uint8* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB. // Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 15) & ~15; const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2); align_buffer_64(row, kRowSize * 2);
// Allocate 1 row of ARGB for source conversion. // Allocate 1 row of ARGB for source conversion.
@@ -606,6 +672,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBCols = ScaleARGBCols_SSE2; ScaleARGBCols = ScaleARGBCols_SSE2;
} }
#endif
#if defined(HAS_SCALEARGBCOLS_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleARGBCols = ScaleARGBCols_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBCols_NEON;
}
}
#endif #endif
if (src_width * 2 == dst_width && x < 0x8000) { if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C; ScaleARGBCols = ScaleARGBColsUp2_C;
@@ -744,6 +818,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 || if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 || clip_x < 0 || clip_y < 0 ||
clip_width > 32768 || clip_height > 32768 ||
(clip_x + clip_width) > dst_width || (clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) { (clip_y + clip_height) > dst_height) {
return -1; return -1;
@@ -762,6 +837,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height, int dst_width, int dst_height,
enum FilterMode filtering) { enum FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 || if (!src_argb || src_width == 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 ||
!dst_argb || dst_width <= 0 || dst_height <= 0) { !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1; return -1;
} }

View File

@@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
} }
} }
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
uint16* dst_ptr, int src_width, int src_height) {
int x; int x;
assert(src_width > 0); assert(src_width > 0);
assert(src_height > 0); for (x = 0; x < src_width - 1; x += 2) {
for (x = 0; x < src_width; ++x) { dst_ptr[0] += src_ptr[0];
const uint8* s = src_ptr + x; dst_ptr[1] += src_ptr[1];
unsigned int sum = 0u; src_ptr += 2;
int y; dst_ptr += 2;
for (y = 0; y < src_height; ++y) { }
sum += s[0]; if (src_width & 1) {
s += src_stride; dst_ptr[0] += src_ptr[0];
}
// TODO(fbarchard): Consider limitting height to 256 to avoid overflow.
dst_ptr[x] = sum < 65535u ? sum : 65535u;
} }
} }
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
uint32* dst_ptr, int src_width, int src_height) {
int x; int x;
assert(src_width > 0); assert(src_width > 0);
assert(src_height > 0); for (x = 0; x < src_width - 1; x += 2) {
for (x = 0; x < src_width; ++x) { dst_ptr[0] += src_ptr[0];
const uint16* s = src_ptr + x; dst_ptr[1] += src_ptr[1];
unsigned int sum = 0u; src_ptr += 2;
int y; dst_ptr += 2;
for (y = 0; y < src_height; ++y) { }
sum += s[0]; if (src_width & 1) {
s += src_stride; dst_ptr[0] += src_ptr[0];
}
// No risk of overflow here now
dst_ptr[x] = sum;
} }
} }
@@ -1030,10 +1022,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height,
if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
filtering = kFilterBilinear; filtering = kFilterBilinear;
} }
// If scaling to larger, switch from Box to Bilinear.
if (dst_width >= src_width || dst_height >= src_height) {
filtering = kFilterBilinear;
}
} }
if (filtering == kFilterBilinear) { if (filtering == kFilterBilinear) {
if (src_height == 1) { if (src_height == 1) {

View File

@@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
); );
} }
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) { uint16* dst_ptr, int src_width, int src_height) {
int tmp_height = 0; int tmp_height = 0;
intptr_t tmp_src = 0; intptr_t tmp_src = 0;
asm volatile ( asm volatile (
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n" "pxor %%xmm4,%%xmm4 \n"
"sub $0x1,%5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(3) ",%%xmm2 \n"
"mov %0,%3 \n" "add %6,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"mov %5,%2 \n"
"test %2,%2 \n"
"je 3f \n"
LABELALIGN
"2: \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n" "punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n" "punpckhbw %%xmm4,%%xmm3 \n"
"paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm2,%%xmm0 \n"
"paddusw %%xmm3,%%xmm1 \n" "paddusw %%xmm3,%%xmm1 \n"
"sub $0x1,%2 \n" "sub $0x1,%2 \n"
"jg 2b \n" "jg 1b \n"
LABELALIGN
"3: \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
"mov %0,%3 \n" // row pointer
"mov %5,%2 \n" // height
"pxor %%xmm0,%%xmm0 \n" // clear accumulators
"pxor %%xmm1,%%xmm1 \n"
"sub $0x10,%4 \n" "sub $0x10,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
@@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned. // Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx, uint8* dst_argb, int dst_width) {
uint8* dst_argb, int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12 = 0; intptr_t src_stepx_x12 = 0;
asm volatile ( asm volatile (

View File

@@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
"subs %2, %2, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // add adjacent
"vpaddl.u8 q1, q1 \n"
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
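The new ScaleRowDown2Linear_NEON above is the horizontal-only half-size filter: vpaddl.u8 sums adjacent byte pairs and vrshrn #1 halves with rounding. A scalar sketch of the same operation (RowDown2LinearRef is a hypothetical name; the C kernel in scale_common.cc computes the equivalent):

#include <stdint.h>

/* Illustrative scalar equivalent: each output byte is the rounded average
 * of two adjacent input bytes. */
static void RowDown2LinearRef(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((src[0] + src[1] + 1) >> 1);
    src += 2;
  }
}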
// Read 32x2 average down and write 16x1. // Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
@@ -517,6 +541,112 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
); );
} }
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
".p2align 2 \n"
"1: \n"
"mov %0, %1 \n"
"mov r12, %5 \n"
"veor q2, q2, q2 \n"
"veor q3, q3, q3 \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"vld1.8 {q0}, [%0], %3 \n"
"vaddw.u8 q3, q3, d1 \n"
"vaddw.u8 q2, q2, d0 \n"
"subs r12, r12, #1 \n"
"bgt 2b \n"
MEMACCESS(2)
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
"add %1, %1, #16 \n"
"subs %4, %4, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q3, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q1, q1, q0 \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"vadd.s32 q2, q1, q3 \n"
"vshl.i32 q0, q3, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"vmov q10, q1 \n"
"vmov q11, q2 \n"
"vuzp.16 q10, q11 \n"
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vsubl.s16 q11, d18, d16 \n"
"vsubl.s16 q12, d19, d17 \n"
"vmovl.u16 q13, d20 \n"
"vmovl.u16 q10, d21 \n"
"vmul.s32 q11, q11, q13 \n"
"vmul.s32 q12, q12, q10 \n"
"vshrn.s32 d18, q11, #16 \n"
"vshrn.s32 d19, q12, #16 \n"
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13"
);
}
#undef LOAD2_DATA8_LANE
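ScaleFilterCols_NEON (and the _Any_ wrapper it pairs with in scale_any.cc) performs the same 16.16 fixed-point horizontal blend as the C path: per output pixel, load the two neighbouring source bytes and mix them by the fractional part of x, which is what the vsubl/vmul/vshrn #16 sequence above computes eight pixels at a time. A scalar sketch of that math, using the hypothetical name FilterColsRef:

#include <stdint.h>

/* Illustrative scalar form of the horizontal bilinear column filter,
 * with x and dx in 16.16 fixed point. */
static void FilterColsRef(uint8_t* dst, const uint8_t* src,
                          int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;            /* integer source position */
    int a = src[xi];
    int b = src[xi + 1];
    int frac = x & 0xffff;       /* fractional part, 0..65535 */
    dst[j] = (uint8_t)(a + (((b - a) * frac) >> 16));
    x += dx;
  }
}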
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
@@ -640,6 +770,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #1 \n"
"vrshrn.u16 d2, q2, #1 \n"
"vrshrn.u16 d3, q3, #1 \n"
MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
@@ -757,6 +916,119 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
); );
} }
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld1.32 {"#dn"["#n"]}, [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int tmp = 0;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"1: \n"
LOAD1_DATA32_LANE(d0, 0)
LOAD1_DATA32_LANE(d0, 1)
LOAD1_DATA32_LANE(d1, 0)
LOAD1_DATA32_LANE(d1, 1)
LOAD1_DATA32_LANE(d2, 0)
LOAD1_DATA32_LANE(d2, 1)
LOAD1_DATA32_LANE(d3, 0)
LOAD1_DATA32_LANE(d3, 1)
MEMACCESS(0)
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
"subs %2, %2, #8 \n" // 8 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1"
);
}
#undef LOAD1_DATA32_LANE
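ScaleARGBCols_NEON is the unfiltered case: it gathers one 32-bit ARGB word per output position at x >> 16 and steps x by dx, eight pixels per iteration. The scalar equivalent is a one-line loop (ARGBColsRef is a hypothetical name; the real C kernel works on uint32 words the same way):

#include <stdint.h>

/* Illustrative: nearest-neighbour ARGB column step, 16.16 fixed point. */
static void ARGBColsRef(uint32_t* dst, const uint32_t* src,
                        int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}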
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
asm volatile (
".p2align 2 \n"
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
"vld1.32 {q2}, [%5] \n" // 0 1 2 3
"vshl.i32 q9, q1, #2 \n" // 4 * dx
"vmul.s32 q1, q1, q2 \n"
"vmov.i8 q3, #0x7f \n" // 0x7F
"vmov.i16 q15, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"vadd.s32 q8, q1, q0 \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(d0, d2, 0)
LOAD2_DATA32_LANE(d0, d2, 1)
LOAD2_DATA32_LANE(d1, d3, 0)
LOAD2_DATA32_LANE(d1, d3, 1)
"vshrn.i32 d22, q8, #9 \n"
"vand.16 d22, d22, d30 \n"
"vdup.8 d24, d22[0] \n"
"vdup.8 d25, d22[2] \n"
"vdup.8 d26, d22[4] \n"
"vdup.8 d27, d22[6] \n"
"vext.8 d4, d24, d25, #4 \n"
"vext.8 d5, d26, d27, #4 \n" // f
"veor.8 q10, q2, q3 \n" // 0x7f ^ f
"vmull.u8 q11, d0, d20 \n"
"vmull.u8 q12, d1, d21 \n"
"vmull.u8 q13, d2, d4 \n"
"vmull.u8 q14, d3, d5 \n"
"vadd.i16 q11, q11, q13 \n"
"vadd.i16 q12, q12, q14 \n"
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
"bgt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
"+r"(x), // %3
"+r"(dx), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
#undef LOAD2_DATA32_LANE
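ScaleARGBFilterCols_NEON blends neighbouring ARGB pixels with a 7-bit fraction: f = (x >> 9) & 0x7f, and each byte lane becomes (a * (0x7f ^ f) + b * f) >> 7, which is what the 0x7f masks and the #7 narrowing shifts above implement. A per-channel scalar sketch, with a hypothetical BlendARGBRef helper and pixels treated as four consecutive bytes:

#include <stdint.h>

/* Illustrative: blend two ARGB pixels (4 bytes each) with a 7-bit fraction. */
static void BlendARGBRef(const uint8_t* a, const uint8_t* b, int f,
                         uint8_t* dst) {
  int c;
  for (c = 0; c < 4; ++c) {
    dst[c] = (uint8_t)((a[c] * (0x7f ^ f) + b[c] * f) >> 7);
  }
}

/* For output pixel j at fixed-point position x: f = (x >> 9) & 0x7f,
 * a = src + (x >> 16) * 4, b = a + 4, then x += dx. */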
#endif // defined(__ARM_NEON__) && !defined(__aarch64__) #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
MEMACCESS(0) MEMACCESS(0)
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n" "b.gt 1b \n"
@@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
"subs %w2, %w2, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // add adjacent
"uaddlp v1.8h, v1.16b \n"
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn2 v0.16b, v1.8h, #1 \n"
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
);
}
// Read 32x2 average down and write 16x1. // Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
@@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1) MEMACCESS(1)
"ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop "subs %w3, %w3, #16 \n" // 16 processed per loop
"uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
"uaddlp v1.8h, v1.16b \n" "uaddlp v1.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
@@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -103,7 +126,7 @@ asm volatile (
"ld1 {v2.16b}, [%3], #16 \n" "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5) MEMACCESS(5)
"ld1 {v3.16b}, [%4], #16 \n" "ld1 {v3.16b}, [%4], #16 \n"
"subs %5, %5, #4 \n" "subs %w5, %w5, #4 \n"
"uaddlp v0.8h, v0.16b \n" "uaddlp v0.8h, v0.16b \n"
"uadalp v0.8h, v1.16b \n" "uadalp v0.8h, v1.16b \n"
"uadalp v0.8h, v2.16b \n" "uadalp v0.8h, v2.16b \n"
@@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1) MEMACCESS(1)
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
@@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
// filter src line 0 with src line 1 // filter src line 0 with src line 1
// expand chars to shorts to allow for room // expand chars to shorts to allow for room
@@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
"subs %2, %2, #24 \n" "subs %w2, %w2, #24 \n"
// average src line 0 with src line 1 // average src line 0 with src line 1
"urhadd v0.8b, v0.8b, v4.8b \n" "urhadd v0.8b, v0.8b, v4.8b \n"
"urhadd v1.8b, v1.8b, v5.8b \n" "urhadd v1.8b, v1.8b, v5.8b \n"
@@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.16b,v1.16b}, [%0], #32 \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #12 \n" "subs %w2, %w2, #12 \n"
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1) MEMACCESS(1)
"st1 {v2.8b}, [%1], #8 \n" "st1 {v2.8b}, [%1], #8 \n"
@@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4) MEMACCESS(4)
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %4, %4, #12 \n" "subs %w4, %w4, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3) MEMACCESS(3)
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"subs %3, %3, #12 \n" "subs %w3, %w3, #12 \n"
// Shuffle the input data around to get align the data // Shuffle the input data around to get align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
); );
} }
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width, int src_height) {
const uint8* src_tmp = NULL;
asm volatile (
"1: \n"
"mov %0, %1 \n"
"mov w12, %w5 \n"
"eor v2.16b, v2.16b, v2.16b \n"
"eor v3.16b, v3.16b, v3.16b \n"
"2: \n"
// load 16 pixels into q0
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n"
"uaddw2 v3.8h, v3.8h, v0.16b \n"
"uaddw v2.8h, v2.8h, v0.8b \n"
"subs w12, w12, #1 \n"
"b.gt 2b \n"
MEMACCESS(2)
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
"add %1, %1, #16 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"b.gt 1b \n"
: "+r"(src_tmp), // %0
"+r"(src_ptr), // %1
"+r"(dst_ptr), // %2
"+r"(src_stride), // %3
"+r"(src_width), // %4
"+r"(src_height) // %5
:
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {v4.b, v5.b}["#n"], [%6] \n"
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_ptr;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v3.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v1.4s, v1.4s, v0.4s \n"
// x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
"add v2.4s, v1.4s, v3.4s \n"
"shl v0.4s, v3.4s, #1 \n" // 8 * dx
"1: \n"
LOAD2_DATA8_LANE(0)
LOAD2_DATA8_LANE(1)
LOAD2_DATA8_LANE(2)
LOAD2_DATA8_LANE(3)
LOAD2_DATA8_LANE(4)
LOAD2_DATA8_LANE(5)
LOAD2_DATA8_LANE(6)
LOAD2_DATA8_LANE(7)
"mov v6.16b, v1.16b \n"
"mov v7.16b, v2.16b \n"
"uzp1 v6.8h, v6.8h, v7.8h \n"
"ushll v4.8h, v4.8b, #0 \n"
"ushll v5.8h, v5.8b, #0 \n"
"ssubl v16.4s, v5.4h, v4.4h \n"
"ssubl2 v17.4s, v5.8h, v4.8h \n"
"ushll v7.4s, v6.4h, #0 \n"
"ushll2 v6.4s, v6.8h, #0 \n"
"mul v16.4s, v16.4s, v7.4s \n"
"mul v17.4s, v17.4s, v6.4s \n"
"shrn v6.4h, v16.4s, #16 \n"
"shrn2 v6.8h, v17.4s, #16 \n"
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3",
"v4", "v5", "v6", "v7", "v16", "v17"
);
}
#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1 // 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) { int dst_width, int source_y_fraction) {
int y_fraction = 256 - source_y_fraction; int y_fraction = 256 - source_y_fraction;
asm volatile ( asm volatile (
"cmp %4, #0 \n" "cmp %w4, #0 \n"
"b.eq 100f \n" "b.eq 100f \n"
"add %2, %2, %1 \n" "add %2, %2, %1 \n"
"cmp %4, #64 \n" "cmp %w4, #64 \n"
"b.eq 75f \n" "b.eq 75f \n"
"cmp %4, #128 \n" "cmp %w4, #128 \n"
"b.eq 50f \n" "b.eq 50f \n"
"cmp %4, #192 \n" "cmp %w4, #192 \n"
"b.eq 25f \n" "b.eq 25f \n"
"dup v5.8b, %w4 \n" "dup v5.8b, %w4 \n"
@@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"umull v6.8h, v0.8b, v4.8b \n" "umull v6.8h, v0.8b, v4.8b \n"
"umull2 v7.8h, v0.16b, v4.16b \n" "umull2 v7.8h, v0.16b, v4.16b \n"
"umlal v6.8h, v1.8b, v5.8b \n" "umlal v6.8h, v1.8b, v5.8b \n"
@@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n" "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
@@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"ld1 {v1.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2) MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n" "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0) MEMACCESS(0)
@@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"100: \n" "100: \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n" "subs %w3, %w3, #16 \n"
MEMACCESS(0) MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" "st1 {v0.16b}, [%0], #16 \n"
"b.gt 100b \n" "b.gt 100b \n"
@@ -631,7 +761,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"ld2 {v0.4s, v1.4s}, [%0], #32 \n" "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
MEMACCESS (0) MEMACCESS (0)
"ld2 {v2.4s, v3.4s}, [%0], #32 \n" "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %2, %2, #8 \n" // 8 processed per loop "subs %w2, %w2, #8 \n" // 8 processed per loop
MEMACCESS (1) MEMACCESS (1)
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
MEMACCESS (1) MEMACCESS (1)
@@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
); );
} }
void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) {
asm volatile (
"1: \n"
MEMACCESS (0)
// load 8 ARGB pixels.
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
"rshrn v1.8b, v1.8h, #1 \n"
"rshrn v2.8b, v2.8h, #1 \n"
"rshrn v3.8b, v3.8h, #1 \n"
MEMACCESS (1)
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width) {
asm volatile ( asm volatile (
@@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" "1: \n"
MEMACCESS (0) MEMACCESS (0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
@@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.s}[3], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop. "subs %w2, %w2, #4 \n" // 4 pixels per loop.
MEMACCESS(1) MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" "st1 {v0.16b}, [%1], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(dst_width) // %2 "+r"(dst_width) // %2
: "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3 : "r"((int64)(src_stepx * 4)) // %3
: "memory", "cc", "v0" : "memory", "cc", "v0"
); );
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned. // Alignment requirement: src_argb 4 byte aligned.
// TODO, might be worth another optimization pass in future. // TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with. // It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
@@ -717,36 +874,36 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1) MEMACCESS(1)
"ld1 {v1.8b}, [%1], %4 \n" "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v2.8b}, [%0], %4 \n" "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v3.8b}, [%1], %4 \n" "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v4.8b}, [%0], %4 \n" "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v5.8b}, [%1], %4 \n" "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0) MEMACCESS(0)
"ld1 {v6.8b}, [%0], %4 \n" "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1) MEMACCESS(1)
"ld1 {v7.8b}, [%1], %4 \n" "ld1 {v7.8b}, [%1], %4 \n"
"uaddl v0.8h, v0.8b, v1.8b \n" "uaddl v0.8h, v0.8b, v1.8b \n"
"uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n"
"uaddl v4.8h, v4.8b, v5.8b \n" "uaddl v4.8h, v4.8b, v5.8b \n"
"uaddl v6.8h, v6.8b, v7.8b \n" "uaddl v6.8h, v6.8b, v7.8b \n"
"mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
"mov v0.d[1], v2.d[0] \n" "mov v0.d[1], v2.d[0] \n"
"mov v2.d[0], v16.d[1] \n" "mov v2.d[0], v16.d[1] \n"
"mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
"mov v4.d[1], v6.d[0] \n" "mov v4.d[1], v6.d[0] \n"
"mov v6.d[0], v16.d[1] \n" "mov v6.d[0], v16.d[1] \n"
"add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
"add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
"rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop. "subs %w3, %w3, #4 \n" // 4 pixels per loop.
MEMACCESS(2) MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" "st1 {v0.16b}, [%2], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
@@ -754,10 +911,129 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"+r"(src_stride), // %1 "+r"(src_stride), // %1
"+r"(dst_argb), // %2 "+r"(dst_argb), // %2
"+r"(dst_width) // %3 "+r"(dst_width) // %3
: "r"(src_stepx * 4) // %4 : "r"((int64)(src_stepx * 4)) // %4
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
); );
} }
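// The box variant averages a 2x2 block (two rows, two adjacent columns) at
// every src_stepx step; rshrn #2 above is the rounded (sum + 2) >> 2. A
// scalar sketch of that average (hypothetical helper):
static void ScaleARGBRowDownEvenBoxSketch(const uint8* src_argb,
                                          ptrdiff_t src_stride, int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8* s = src_argb + 4 * x * src_stepx;
    const uint8* t = s + src_stride;
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * x + c] =
          (uint8)((s[c] + s[4 + c] + t[c] + t[4 + c] + 2) >> 2);
    }
  }
}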
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld1 {"#vn".s}["#n"], [%6] \n"
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
int64 tmp64 = 0;
asm volatile (
"1: \n"
LOAD1_DATA32_LANE(v0, 0)
LOAD1_DATA32_LANE(v0, 1)
LOAD1_DATA32_LANE(v0, 2)
LOAD1_DATA32_LANE(v0, 3)
LOAD1_DATA32_LANE(v1, 0)
LOAD1_DATA32_LANE(v1, 1)
LOAD1_DATA32_LANE(v1, 2)
LOAD1_DATA32_LANE(v1, 3)
MEMACCESS(0)
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp64), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1"
);
}
#undef LOAD1_DATA32_LANE
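// ScaleARGBCols steps a 16.16 fixed-point x by dx and gathers the ARGB pixel
// at (x >> 16) for each destination column; the assembly above just does
// eight such gathers per iteration. The per-pixel logic, as a scalar sketch
// (hypothetical helper):
static void ScaleARGBColsSketch(uint8* dst_argb, const uint8* src_argb,
                                int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // nearest-neighbour pick, no filtering
    x += dx;
  }
}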
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n) \
"lsr %5, %3, #16 \n" \
"add %6, %1, %5, lsl #2 \n" \
"add %3, %3, %4 \n" \
MEMACCESS(6) \
"ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
const uint8* src_tmp = src_argb;
int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
int64 x64 = (int64) x;
int64 dx64 = (int64) dx;
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
"ld1 {v2.4s}, [%5] \n" // 0 1 2 3
"shl v6.4s, v1.4s, #2 \n" // 4 * dx
"mul v1.4s, v1.4s, v2.4s \n"
"movi v3.16b, #0x7f \n" // 0x7F
"movi v4.8h, #0x7f \n" // 0x7F
// x , x + 1 * dx, x + 2 * dx, x + 3 * dx
"add v5.4s, v1.4s, v0.4s \n"
"1: \n"
// d0, d1: a
// d2, d3: b
LOAD2_DATA32_LANE(v0, v1, 0)
LOAD2_DATA32_LANE(v0, v1, 1)
LOAD2_DATA32_LANE(v0, v1, 2)
LOAD2_DATA32_LANE(v0, v1, 3)
"shrn v2.4h, v5.4s, #9 \n"
"and v2.8b, v2.8b, v4.8b \n"
"dup v16.8b, v2.b[0] \n"
"dup v17.8b, v2.b[2] \n"
"dup v18.8b, v2.b[4] \n"
"dup v19.8b, v2.b[6] \n"
"ext v2.8b, v16.8b, v17.8b, #4 \n"
"ext v17.8b, v18.8b, v19.8b, #4 \n"
"ins v2.d[1], v17.d[0] \n" // f
"eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
"umull v16.8h, v0.8b, v7.8b \n"
"umull2 v17.8h, v0.16b, v7.16b \n"
"umull v18.8h, v1.8b, v2.8b \n"
"umull2 v19.8h, v1.16b, v2.16b \n"
"add v16.8h, v16.8h, v18.8h \n"
"add v17.8h, v17.8h, v19.8h \n"
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width64), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
"+r"(src_tmp) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v16", "v17", "v18", "v19"
);
}
#undef LOAD2_DATA32_LANE
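// The filtered column scaler derives a 7-bit fraction f = (x >> 9) & 0x7f
// from the 16.16 accumulator and blends the two neighbouring source pixels
// per channel as (a * (127 - f) + b * f) >> 7 (the 0x7f ^ f / umull /
// shrn #7 sequence above). A scalar sketch of that blend (hypothetical
// helper, not necessarily bit-exact with libyuv's C path):
static void ScaleARGBFilterColsSketch(uint8* dst_argb, const uint8* src_argb,
                                      int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    const uint8* a = src_argb + 4 * (x >> 16);
    const uint8* b = a + 4;
    int f = (x >> 9) & 0x7f;  // top 7 bits of the fractional part
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * j + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}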
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus #ifdef __cplusplus
View File

@@ -9,6 +9,7 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
@@ -16,7 +17,8 @@ extern "C" {
#endif #endif
// This module is for Visual C x86. // This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
defined(_MSC_VER) && !defined(__clang__)
// Offsets for source bytes 0 to 9 // Offsets for source bytes 0 to 9
static uvec8 kShuf0 = static uvec8 kShuf0 =
@@ -93,8 +95,7 @@ static uvec16 kScaleAb2 =
{ 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
// Reads 32 pixels, throws half away and writes 16 pixels. // Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x1 rectangle to 16x1. // Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x2 rectangle to 16x1. // Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
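// The AVX2 point sampler keeps the odd-numbered source pixels: vpsrlw by 8
// moves byte 1 of each 16-bit pair into place and vpackuswb repacks it. The
// equivalent scalar selection (hypothetical helper):
static void ScaleRowDown2Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}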
// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
vzeroupper
ret
}
}
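// In the linear variant, vpmaddubsw with an all-ones byte vector sums each
// adjacent byte pair into a word, and vpavgw against zero turns that sum
// into the rounded average (sum + 1) >> 1. Scalar sketch (hypothetical
// helper):
static void ScaleRowDown2LinearSketch(const uint8* src_ptr, uint8* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}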
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
vpsrlw ymm4, ymm4, 15
vpackuswb ymm4, ymm4, ymm4
vpxor ymm5, ymm5, ymm5 // constant 0
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
jg wloop
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN2_AVX2
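// The 64x2 -> 32x1 box path first averages the two rows with vpavgb and then
// averages horizontally, so it is built from two rounded 2-tap averages
// rather than one exact 4-tap sum. For reference, an exact-sum scalar 2x2
// box (hypothetical helper, not bit-exact with the AVX2 code):
static void ScaleRowDown2BoxSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[2 * x] + s[2 * x + 1] +
                          t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}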
// Point samples 32 pixels to 8 pixels. // Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Blends 32x4 rectangle to 8x1. // Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm7, 8 psrlw xmm7, 8
wloop: wloop:
movdqu xmm0, [eax] movdqu xmm0, [eax] // average rows
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + esi] movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16] movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows pavgb xmm0, xmm2
pavgb xmm1, xmm3 pavgb xmm1, xmm3
movdqu xmm2, [eax + esi * 2] movdqu xmm2, [eax + esi * 2]
movdqu xmm3, [eax + esi * 2 + 16] movdqu xmm3, [eax + esi * 2 + 16]
@@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
} }
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
vpsrld ymm5, ymm5, 24
vpslld ymm5, ymm5, 16
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
vzeroupper
ret
}
}
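// The 4:1 point sampler keeps one pixel out of every four; the 0x00ff0000
// mask plus the pack/permute steps appear to select byte 2 of each 4-byte
// group. Scalar sketch of that selection (hypothetical helper):
static void ScaleRowDown4Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}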
// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_ptr
mov esi, [esp + 8 + 8] // src_stride
mov edx, [esp + 8 + 12] // dst_ptr
mov ecx, [esp + 8 + 16] // dst_width
lea edi, [esi + esi * 2] // src_stride * 3
vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
vpsrlw ymm7, ymm7, 8
wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi * 2]
vmovdqu ymm3, [eax + esi * 2 + 32]
vpavgb ymm2, ymm2, [eax + edi]
vpavgb ymm3, ymm3, [eax + edi + 32]
lea eax, [eax + 64]
vpavgb ymm0, ymm0, ymm2
vpavgb ymm1, ymm1, ymm3
vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
vpand ymm3, ymm1, ymm7
vpsrlw ymm0, ymm0, 8
vpsrlw ymm1, ymm1, 8
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
vpsrlw ymm0, ymm0, 8
vpavgw ymm0, ymm0, ymm2
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
vmovdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
jg wloop
pop edi
pop esi
vzeroupper
ret
}
}
#endif // HAS_SCALEROWDOWN4_AVX2
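// The 64x4 -> 16x1 box path averages the four rows pairwise with vpavgb and
// then reduces columns 64 -> 32 -> 16 with rounded pair averages. An
// exact-sum scalar sketch of a 4x4 box average (hypothetical helper, not
// bit-exact with the AVX2 code):
static void ScaleRowDown4BoxSketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {      // four rows
      for (j = 0; j < 4; ++j) {    // four columns
        sum += src_ptr[i * src_stride + 4 * x + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);
  }
}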
// Point samples 32 pixels to 24 pixels. // Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling. // Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu. __declspec(naked)
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
// xmm7 kRound34 // xmm7 kRound34
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
} }
// Note that movdqa+palign may be better than movdqu. // Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
// 3/8 point sampler // 3/8 point sampler
// Scale 32 pixels to 12 // Scale 32 pixels to 12
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
__asm { __asm {
@@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
} }
// Scale 16x3 pixels to 6x1 with interpolation // Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
} }
// Scale 16x2 pixels to 6x1 with interpolation // Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width) {
@@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
} }
} }
// Reads 16xN bytes and produces 16 shorts at a time. // Reads 16 bytes and accumulates to 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. __declspec(naked)
__declspec(naked) __declspec(align(16)) void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint16* dst_ptr, int src_width,
int src_height) {
__asm { __asm {
push esi mov eax, [esp + 4] // src_ptr
push edi mov edx, [esp + 8] // dst_ptr
push ebx mov ecx, [esp + 12] // src_width
push ebp pxor xmm5, xmm5
mov esi, [esp + 16 + 4] // src_ptr
mov edx, [esp + 16 + 8] // src_stride
mov edi, [esp + 16 + 12] // dst_ptr
mov ecx, [esp + 16 + 16] // dst_width
mov ebx, [esp + 16 + 20] // height
pxor xmm4, xmm4
dec ebx
// sum rows
xloop: xloop:
// first row movdqu xmm3, [eax] // read 16 bytes
movdqu xmm0, [esi] lea eax, [eax + 16]
lea eax, [esi + edx] movdqu xmm0, [edx] // read 16 words from destination
movdqa xmm1, xmm0 movdqu xmm1, [edx + 16]
punpcklbw xmm0, xmm4 movdqa xmm2, xmm3
punpckhbw xmm1, xmm4 punpcklbw xmm2, xmm5
lea esi, [esi + 16] punpckhbw xmm3, xmm5
mov ebp, ebx
test ebp, ebp
je ydone
// sum remaining rows
yloop:
movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
paddusw xmm0, xmm2 // sum 16 words paddusw xmm0, xmm2 // sum 16 words
paddusw xmm1, xmm3 paddusw xmm1, xmm3
sub ebp, 1 movdqu [edx], xmm0 // write 16 words to destination
jg yloop movdqu [edx + 16], xmm1
lea edx, [edx + 32]
ydone:
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16 sub ecx, 16
jg xloop jg xloop
pop ebp
pop ebx
pop edi
pop esi
ret ret
} }
} }
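// The reworked ScaleAddRow accumulates a single row of source bytes into a
// running row of 16-bit sums (paddusw saturates those sums); callers now add
// rows one at a time instead of passing a height, as the old ScaleAddRows
// did. Scalar sketch with the same saturating behaviour (hypothetical
// helper):
static void ScaleAddRowSketch(const uint8* src_ptr, uint16* dst_ptr,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    unsigned int sum = (unsigned int)(dst_ptr[x]) + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);
  }
}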
// Bilinear column filtering. SSSE3 version. #ifdef HAS_SCALEADDROW_AVX2
// TODO(fbarchard): Port to Neon // Reads 32 bytes and accumulates to 32 shorts at a time.
// TODO(fbarchard): Switch the following: __declspec(naked)
// xor ebx, ebx void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
// mov bx, word ptr [esi + eax] // 2 source x0 pixels __asm {
// To mov eax, [esp + 4] // src_ptr
// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels mov edx, [esp + 8] // dst_ptr
// when drmemory bug fixed. mov ecx, [esp + 12] // src_width
// https://code.google.com/p/drmemory/issues/detail?id=1396 vpxor ymm5, ymm5, ymm5
__declspec(naked) __declspec(align(16)) // sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
vpunpcklbw ymm2, ymm3, ymm5
vpunpckhbw ymm3, ymm3, ymm5
vpaddusw ymm0, ymm2, [edx] // sum 16 words
vpaddusw ymm1, ymm3, [edx + 32]
vmovdqu [edx], ymm0 // write 32 words to destination
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 32
jg xloop
vzeroupper
ret
}
}
#endif // HAS_SCALEADDROW_AVX2
// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 16 pixels, duplicates them and writes 32 pixels. // Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
} }
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb, void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
} }
// Blends 8x1 rectangle to 4x1. // Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
} }
// Blends 8x2 rectangle to 4x1. // Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
} }
// Reads 4 pixels at a time. // Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
uint8* dst_argb, int dst_width) { uint8* dst_argb, int dst_width) {
@@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
} }
// Blends four 2x2 to 4x1. // Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
int src_stepx, int src_stepx,
@@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
} }
// Column scaling unfiltered. SSE2 version. // Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
}; };
__declspec(naked) __declspec(align(16)) __declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
} }
// Reads 4 pixels, duplicates them and writes 8 pixels. // Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx) { int dst_width, int x, int dx) {
__asm { __asm {
@@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv_X86(int num, int div) { int FixedDiv_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
@@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) {
} }
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16)) __declspec(naked)
int FixedDiv1_X86(int num, int div) { int FixedDiv1_X86(int num, int div) {
__asm { __asm {
mov eax, [esp + 4] // num mov eax, [esp + 4] // num
@@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) {
ret ret
} }
} }
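// FixedDiv returns num / div as a 16.16 fixed-point value, i.e.
// (num << 16) / div evaluated with a 64-bit intermediate (the cdq / shld /
// idiv sequence above does the same in 32:32 form). A portable sketch
// (hypothetical helper, using libyuv's int64 typedef):
static int FixedDivSketch(int num, int div) {
  return (int)((((int64)num) << 16) / div);
}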
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"